aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig35
-rw-r--r--arch/x86/Kconfig.cpu2
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/boot/compressed/misc.c15
-rw-r--r--arch/x86/boot/mkcpustr.c2
-rw-r--r--arch/x86/boot/video-vga.c9
-rw-r--r--arch/x86/boot/video.c7
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S10
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S20
-rw-r--r--arch/x86/ia32/ia32_aout.c2
-rw-r--r--arch/x86/ia32/ia32entry.S6
-rw-r--r--arch/x86/ia32/sys_ia32.c76
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/alternative.h16
-rw-r--r--arch/x86/include/asm/apb_timer.h70
-rw-r--r--arch/x86/include/asm/atomic.h299
-rw-r--r--arch/x86/include/asm/atomic64_32.h160
-rw-r--r--arch/x86/include/asm/atomic64_64.h224
-rw-r--r--arch/x86/include/asm/atomic_32.h415
-rw-r--r--arch/x86/include/asm/atomic_64.h485
-rw-r--r--arch/x86/include/asm/compat.h3
-rw-r--r--arch/x86/include/asm/cpufeature.h4
-rw-r--r--arch/x86/include/asm/debugreg.h3
-rw-r--r--arch/x86/include/asm/e820.h5
-rw-r--r--arch/x86/include/asm/elf.h5
-rw-r--r--arch/x86/include/asm/fb.h4
-rw-r--r--arch/x86/include/asm/fixmap.h16
-rw-r--r--arch/x86/include/asm/highmem.h4
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h1
-rw-r--r--arch/x86/include/asm/hw_irq.h7
-rw-r--r--arch/x86/include/asm/hyperv.h186
-rw-r--r--arch/x86/include/asm/i387.h12
-rw-r--r--arch/x86/include/asm/i8259.h21
-rw-r--r--arch/x86/include/asm/io.h155
-rw-r--r--arch/x86/include/asm/io_32.h196
-rw-r--r--arch/x86/include/asm/io_64.h181
-rw-r--r--arch/x86/include/asm/io_apic.h7
-rw-r--r--arch/x86/include/asm/irq_vectors.h48
-rw-r--r--arch/x86/include/asm/kprobes.h31
-rw-r--r--arch/x86/include/asm/kvm_emulate.h17
-rw-r--r--arch/x86/include/asm/kvm_host.h60
-rw-r--r--arch/x86/include/asm/kvm_para.h1
-rw-r--r--arch/x86/include/asm/local.h37
-rw-r--r--arch/x86/include/asm/mmzone_64.h6
-rw-r--r--arch/x86/include/asm/mrst.h19
-rw-r--r--arch/x86/include/asm/nmi.h1
-rw-r--r--arch/x86/include/asm/numa_64.h5
-rw-r--r--arch/x86/include/asm/numaq.h5
-rw-r--r--arch/x86/include/asm/olpc.h20
-rw-r--r--arch/x86/include/asm/page_types.h1
-rw-r--r--arch/x86/include/asm/paravirt.h9
-rw-r--r--arch/x86/include/asm/paravirt_types.h4
-rw-r--r--arch/x86/include/asm/pci.h39
-rw-r--r--arch/x86/include/asm/pci_64.h2
-rw-r--r--arch/x86/include/asm/pci_x86.h23
-rw-r--r--arch/x86/include/asm/percpu.h119
-rw-r--r--arch/x86/include/asm/perf_event.h30
-rw-r--r--arch/x86/include/asm/pgalloc.h5
-rw-r--r--arch/x86/include/asm/pgtable_32.h6
-rw-r--r--arch/x86/include/asm/pgtable_64.h2
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/proto.h10
-rw-r--r--arch/x86/include/asm/ptrace.h11
-rw-r--r--arch/x86/include/asm/rwsem.h81
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/smp.h9
-rw-r--r--arch/x86/include/asm/stacktrace.h2
-rw-r--r--arch/x86/include/asm/svm.h2
-rw-r--r--arch/x86/include/asm/sys_ia32.h11
-rw-r--r--arch/x86/include/asm/syscall.h2
-rw-r--r--arch/x86/include/asm/syscalls.h15
-rw-r--r--arch/x86/include/asm/system.h8
-rw-r--r--arch/x86/include/asm/uaccess_64.h21
-rw-r--r--arch/x86/include/asm/unistd_32.h4
-rw-r--r--arch/x86/include/asm/unistd_64.h3
-rw-r--r--arch/x86/include/asm/user.h58
-rw-r--r--arch/x86/include/asm/uv/bios.h11
-rw-r--r--arch/x86/include/asm/uv/uv.h1
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h3
-rw-r--r--arch/x86/include/asm/visws/cobalt.h2
-rw-r--r--arch/x86/include/asm/vmx.h5
-rw-r--r--arch/x86/include/asm/x86_init.h17
-rw-r--r--arch/x86/include/asm/xsave.h2
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c33
-rw-r--r--arch/x86/kernel/alternative.c82
-rw-r--r--arch/x86/kernel/apb_timer.c784
-rw-r--r--arch/x86/kernel/aperture_64.c1
-rw-r--r--arch/x86/kernel/apic/apic.c10
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c272
-rw-r--r--arch/x86/kernel/apic/nmi.c14
-rw-r--r--arch/x86/kernel/apic/numaq_32.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c89
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/bios_uv.c39
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig14
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c620
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c9
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c336
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c12
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c208
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c10
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h6
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c94
-rw-r--r--arch/x86/kernel/cpu/perf_event.c1871
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c416
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c980
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c157
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c13
-rw-r--r--arch/x86/kernel/dumpstack_32.c5
-rw-r--r--arch/x86/kernel/dumpstack_64.c16
-rw-r--r--arch/x86/kernel/e820.c357
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/ftrace.c36
-rw-r--r--arch/x86/kernel/head32.c10
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/head_64.S2
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/hw_breakpoint.c38
-rw-r--r--arch/x86/kernel/i387.c71
-rw-r--r--arch/x86/kernel/i8259.c94
-rw-r--r--arch/x86/kernel/irqinit.c36
-rw-r--r--arch/x86/kernel/kprobes.c614
-rw-r--r--arch/x86/kernel/microcode_intel.c2
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c7
-rw-r--r--arch/x86/kernel/mrst.c216
-rw-r--r--arch/x86/kernel/olpc.c10
-rw-r--r--arch/x86/kernel/paravirt.c4
-rw-r--r--arch/x86/kernel/pci-calgary_64.c2
-rw-r--r--arch/x86/kernel/pci-dma.c15
-rw-r--r--arch/x86/kernel/process.c7
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c7
-rw-r--r--arch/x86/kernel/ptrace.c67
-rw-r--r--arch/x86/kernel/setup.c31
-rw-r--r--arch/x86/kernel/setup_percpu.c6
-rw-r--r--arch/x86/kernel/smpboot.c23
-rw-r--r--arch/x86/kernel/sys_i386_32.c185
-rw-r--r--arch/x86/kernel/sys_x86_64.c12
-rw-r--r--arch/x86/kernel/syscall_table_32.S4
-rw-r--r--arch/x86/kernel/time.c4
-rw-r--r--arch/x86/kernel/traps.c3
-rw-r--r--arch/x86/kernel/tsc.c6
-rw-r--r--arch/x86/kernel/uv_sysfs.c6
-rw-r--r--arch/x86/kernel/visws_quirks.c27
-rw-r--r--arch/x86/kernel/vmi_32.c35
-rw-r--r--arch/x86/kernel/vmiclock_32.c8
-rw-r--r--arch/x86/kernel/vmlinux.lds.S4
-rw-r--r--arch/x86/kernel/vsyscall_64.c3
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c3
-rw-r--r--arch/x86/kernel/x86_init.c11
-rw-r--r--arch/x86/kernel/xsave.c1
-rw-r--r--arch/x86/kvm/Kconfig2
-rw-r--r--arch/x86/kvm/emulate.c440
-rw-r--r--arch/x86/kvm/i8254.c26
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/i8259.c46
-rw-r--r--arch/x86/kvm/irq.h3
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h31
-rw-r--r--arch/x86/kvm/lapic.c31
-rw-r--r--arch/x86/kvm/lapic.h8
-rw-r--r--arch/x86/kvm/mmu.c137
-rw-r--r--arch/x86/kvm/mmu.h35
-rw-r--r--arch/x86/kvm/paging_tmpl.h13
-rw-r--r--arch/x86/kvm/svm.c237
-rw-r--r--arch/x86/kvm/trace.h59
-rw-r--r--arch/x86/kvm/vmx.c396
-rw-r--r--arch/x86/kvm/x86.c1105
-rw-r--r--arch/x86/kvm/x86.h30
-rw-r--r--arch/x86/lib/Makefile5
-rw-r--r--arch/x86/lib/cache-smp.c19
-rw-r--r--arch/x86/lib/copy_user_64.S6
-rw-r--r--arch/x86/lib/io_64.c25
-rw-r--r--arch/x86/lib/memcpy_64.S23
-rw-r--r--arch/x86/lib/memset_64.S18
-rw-r--r--arch/x86/lib/rwsem_64.S81
-rw-r--r--arch/x86/mm/init.c7
-rw-r--r--arch/x86/mm/init_32.c15
-rw-r--r--arch/x86/mm/init_64.c28
-rw-r--r--arch/x86/mm/ioremap.c41
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c2
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c16
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h2
-rw-r--r--arch/x86/mm/mmap.c4
-rw-r--r--arch/x86/mm/numa_32.c3
-rw-r--r--arch/x86/mm/numa_64.c332
-rw-r--r--arch/x86/mm/pgtable.c31
-rw-r--r--arch/x86/mm/tlb.c8
-rw-r--r--arch/x86/oprofile/nmi_int.c17
-rw-r--r--arch/x86/oprofile/op_model_amd.c261
-rw-r--r--arch/x86/oprofile/op_model_p4.c6
-rw-r--r--arch/x86/oprofile/op_model_ppro.c21
-rw-r--r--arch/x86/oprofile/op_x86_model.h20
-rw-r--r--arch/x86/pci/Makefile5
-rw-r--r--arch/x86/pci/acpi.c89
-rw-r--r--arch/x86/pci/amd_bus.c127
-rw-r--r--arch/x86/pci/bus_numa.c28
-rw-r--r--arch/x86/pci/bus_numa.h12
-rw-r--r--arch/x86/pci/common.c9
-rw-r--r--arch/x86/pci/i386.c18
-rw-r--r--arch/x86/pci/init.c8
-rw-r--r--arch/x86/pci/irq.c18
-rw-r--r--arch/x86/pci/legacy.c24
-rw-r--r--arch/x86/pci/mmconfig-shared.c17
-rw-r--r--arch/x86/pci/mrst.c262
-rw-r--r--arch/x86/pci/numaq_32.c12
-rw-r--r--arch/x86/pci/olpc.c3
-rw-r--r--arch/x86/pci/visws.c6
-rw-r--r--arch/x86/tools/test_get_len.c4
-rw-r--r--arch/x86/xen/enlighten.c7
-rw-r--r--arch/x86/xen/mmu.c21
-rw-r--r--arch/x86/xen/smp.c2
-rw-r--r--arch/x86/xen/xen-asm_32.S4
222 files changed, 9823 insertions, 5989 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ddb52b8d38a7..0eacb1ffb421 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -31,6 +31,7 @@ config X86
31 select ARCH_WANT_FRAME_POINTERS 31 select ARCH_WANT_FRAME_POINTERS
32 select HAVE_DMA_ATTRS 32 select HAVE_DMA_ATTRS
33 select HAVE_KRETPROBES 33 select HAVE_KRETPROBES
34 select HAVE_OPTPROBES
34 select HAVE_FTRACE_MCOUNT_RECORD 35 select HAVE_FTRACE_MCOUNT_RECORD
35 select HAVE_DYNAMIC_FTRACE 36 select HAVE_DYNAMIC_FTRACE
36 select HAVE_FUNCTION_TRACER 37 select HAVE_FUNCTION_TRACER
@@ -45,6 +46,7 @@ config X86
45 select HAVE_GENERIC_DMA_COHERENT if X86_32 46 select HAVE_GENERIC_DMA_COHERENT if X86_32
46 select HAVE_EFFICIENT_UNALIGNED_ACCESS 47 select HAVE_EFFICIENT_UNALIGNED_ACCESS
47 select USER_STACKTRACE_SUPPORT 48 select USER_STACKTRACE_SUPPORT
49 select HAVE_REGS_AND_STACK_ACCESS_API
48 select HAVE_DMA_API_DEBUG 50 select HAVE_DMA_API_DEBUG
49 select HAVE_KERNEL_GZIP 51 select HAVE_KERNEL_GZIP
50 select HAVE_KERNEL_BZIP2 52 select HAVE_KERNEL_BZIP2
@@ -100,6 +102,9 @@ config ZONE_DMA
100config SBUS 102config SBUS
101 bool 103 bool
102 104
105config NEED_DMA_MAP_STATE
106 def_bool (X86_64 || DMAR || DMA_API_DEBUG)
107
103config GENERIC_ISA_DMA 108config GENERIC_ISA_DMA
104 def_bool y 109 def_bool y
105 110
@@ -183,6 +188,9 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
183config ARCH_SUPPORTS_DEBUG_PAGEALLOC 188config ARCH_SUPPORTS_DEBUG_PAGEALLOC
184 def_bool y 189 def_bool y
185 190
191config HAVE_EARLY_RES
192 def_bool y
193
186config HAVE_INTEL_TXT 194config HAVE_INTEL_TXT
187 def_bool y 195 def_bool y
188 depends on EXPERIMENTAL && DMAR && ACPI 196 depends on EXPERIMENTAL && DMAR && ACPI
@@ -388,8 +396,12 @@ config X86_ELAN
388 396
389config X86_MRST 397config X86_MRST
390 bool "Moorestown MID platform" 398 bool "Moorestown MID platform"
399 depends on PCI
400 depends on PCI_GOANY
391 depends on X86_32 401 depends on X86_32
392 depends on X86_EXTENDED_PLATFORM 402 depends on X86_EXTENDED_PLATFORM
403 depends on X86_IO_APIC
404 select APB_TIMER
393 ---help--- 405 ---help---
394 Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin 406 Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
395 Internet Device(MID) platform. Moorestown consists of two chips: 407 Internet Device(MID) platform. Moorestown consists of two chips:
@@ -424,6 +436,7 @@ config X86_32_NON_STANDARD
424config X86_NUMAQ 436config X86_NUMAQ
425 bool "NUMAQ (IBM/Sequent)" 437 bool "NUMAQ (IBM/Sequent)"
426 depends on X86_32_NON_STANDARD 438 depends on X86_32_NON_STANDARD
439 depends on PCI
427 select NUMA 440 select NUMA
428 select X86_MPPARSE 441 select X86_MPPARSE
429 ---help--- 442 ---help---
@@ -568,6 +581,18 @@ config PARAVIRT_DEBUG
568 Enable to debug paravirt_ops internals. Specifically, BUG if 581 Enable to debug paravirt_ops internals. Specifically, BUG if
569 a paravirt_op is missing when it is called. 582 a paravirt_op is missing when it is called.
570 583
584config NO_BOOTMEM
585 default y
586 bool "Disable Bootmem code"
587 ---help---
588 Use early_res directly instead of bootmem before slab is ready.
589 - allocator (buddy) [generic]
590 - early allocator (bootmem) [generic]
591 - very early allocator (reserve_early*()) [x86]
592 - very very early allocator (early brk model) [x86]
593 So reduce one layer between early allocator to final allocator
594
595
571config MEMTEST 596config MEMTEST
572 bool "Memtest" 597 bool "Memtest"
573 ---help--- 598 ---help---
@@ -612,6 +637,16 @@ config HPET_EMULATE_RTC
612 def_bool y 637 def_bool y
613 depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) 638 depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
614 639
640config APB_TIMER
641 def_bool y if MRST
642 prompt "Langwell APB Timer Support" if X86_MRST
643 help
644 APB timer is the replacement for 8254, HPET on X86 MID platforms.
645 The APBT provides a stable time base on SMP
646 systems, unlike the TSC, but it is more expensive to access,
647 as it is off-chip. APB timers are always running regardless of CPU
648 C states, they are used as per CPU clockevent device when possible.
649
615# Mark as embedded because too many people got it wrong. 650# Mark as embedded because too many people got it wrong.
616# The code disables itself when not needed. 651# The code disables itself when not needed.
617config DMI 652config DMI
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index f20ddf84a893..a19829374e6a 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -319,7 +319,7 @@ config X86_L1_CACHE_SHIFT
319 319
320config X86_XADD 320config X86_XADD
321 def_bool y 321 def_bool y
322 depends on X86_32 && !M386 322 depends on X86_64 || !M386
323 323
324config X86_PPRO_FENCE 324config X86_PPRO_FENCE
325 bool "PentiumPro memory ordering errata workaround" 325 bool "PentiumPro memory ordering errata workaround"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 78b32be55e9e..0a43dc515e4c 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -135,9 +135,7 @@ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
135# suspend and hibernation support 135# suspend and hibernation support
136drivers-$(CONFIG_PM) += arch/x86/power/ 136drivers-$(CONFIG_PM) += arch/x86/power/
137 137
138ifeq ($(CONFIG_X86_32),y)
139drivers-$(CONFIG_FB) += arch/x86/video/ 138drivers-$(CONFIG_FB) += arch/x86/video/
140endif
141 139
142#### 140####
143# boot loader support. Several targets are kept for legacy purposes 141# boot loader support. Several targets are kept for legacy purposes
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 3b22fe8ab91b..51e240779a44 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -19,11 +19,6 @@
19#define _ASM_X86_DESC_H 1 19#define _ASM_X86_DESC_H 1
20#endif 20#endif
21 21
22#ifdef CONFIG_X86_64
23#define _LINUX_STRING_H_ 1
24#define __LINUX_BITMAP_H 1
25#endif
26
27#include <linux/linkage.h> 22#include <linux/linkage.h>
28#include <linux/screen_info.h> 23#include <linux/screen_info.h>
29#include <linux/elf.h> 24#include <linux/elf.h>
@@ -131,8 +126,8 @@ static void error(char *m);
131static struct boot_params *real_mode; /* Pointer to real-mode data */ 126static struct boot_params *real_mode; /* Pointer to real-mode data */
132static int quiet; 127static int quiet;
133 128
134static void *memset(void *s, int c, unsigned n); 129void *memset(void *s, int c, size_t n);
135void *memcpy(void *dest, const void *src, unsigned n); 130void *memcpy(void *dest, const void *src, size_t n);
136 131
137static void __putstr(int, const char *); 132static void __putstr(int, const char *);
138#define putstr(__x) __putstr(0, __x) 133#define putstr(__x) __putstr(0, __x)
@@ -185,11 +180,9 @@ static void __putstr(int error, const char *s)
185 return; 180 return;
186#endif 181#endif
187 182
188#ifdef CONFIG_X86_32
189 if (real_mode->screen_info.orig_video_mode == 0 && 183 if (real_mode->screen_info.orig_video_mode == 0 &&
190 lines == 0 && cols == 0) 184 lines == 0 && cols == 0)
191 return; 185 return;
192#endif
193 186
194 x = real_mode->screen_info.orig_x; 187 x = real_mode->screen_info.orig_x;
195 y = real_mode->screen_info.orig_y; 188 y = real_mode->screen_info.orig_y;
@@ -223,7 +216,7 @@ static void __putstr(int error, const char *s)
223 outb(0xff & (pos >> 1), vidport+1); 216 outb(0xff & (pos >> 1), vidport+1);
224} 217}
225 218
226static void *memset(void *s, int c, unsigned n) 219void *memset(void *s, int c, size_t n)
227{ 220{
228 int i; 221 int i;
229 char *ss = s; 222 char *ss = s;
@@ -233,7 +226,7 @@ static void *memset(void *s, int c, unsigned n)
233 return s; 226 return s;
234} 227}
235 228
236void *memcpy(void *dest, const void *src, unsigned n) 229void *memcpy(void *dest, const void *src, size_t n)
237{ 230{
238 int i; 231 int i;
239 const char *s = src; 232 const char *s = src;
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index 8ef60f20b371..919257f526f2 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -22,7 +22,7 @@ int main(void)
22 int i, j; 22 int i, j;
23 const char *str; 23 const char *str;
24 24
25 printf("static const char x86_cap_strs[] = \n"); 25 printf("static const char x86_cap_strs[] =\n");
26 26
27 for (i = 0; i < NCAPINTS; i++) { 27 for (i = 0; i < NCAPINTS; i++) {
28 for (j = 0; j < 32; j++) { 28 for (j = 0; j < 32; j++) {
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 819caa1f2008..ed7aeff786b2 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -42,22 +42,15 @@ static u8 vga_set_basic_mode(void)
42{ 42{
43 struct biosregs ireg, oreg; 43 struct biosregs ireg, oreg;
44 u16 ax; 44 u16 ax;
45 u8 rows;
46 u8 mode; 45 u8 mode;
47 46
48 initregs(&ireg); 47 initregs(&ireg);
49 48
49 /* Query current mode */
50 ax = 0x0f00; 50 ax = 0x0f00;
51 intcall(0x10, &ireg, &oreg); 51 intcall(0x10, &ireg, &oreg);
52 mode = oreg.al; 52 mode = oreg.al;
53 53
54 set_fs(0);
55 rows = rdfs8(0x484); /* rows minus one */
56
57 if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
58 (rows == 0 || rows == 24))
59 return mode;
60
61 if (mode != 3 && mode != 7) 54 if (mode != 3 && mode != 7)
62 mode = 3; 55 mode = 3;
63 56
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index f767164cd5df..43eda284d27f 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -298,11 +298,18 @@ static void restore_screen(void)
298 } 298 }
299 299
300 /* Restore cursor position */ 300 /* Restore cursor position */
301 if (saved.curx >= xs)
302 saved.curx = xs-1;
303 if (saved.cury >= ys)
304 saved.cury = ys-1;
305
301 initregs(&ireg); 306 initregs(&ireg);
302 ireg.ah = 0x02; /* Set cursor position */ 307 ireg.ah = 0x02; /* Set cursor position */
303 ireg.dh = saved.cury; 308 ireg.dh = saved.cury;
304 ireg.dl = saved.curx; 309 ireg.dl = saved.curx;
305 intcall(0x10, &ireg, NULL); 310 intcall(0x10, &ireg, NULL);
311
312 store_cursor_position();
306} 313}
307 314
308void set_video(void) 315void set_video(void)
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index 39b98ed2c1b9..575331cb2a8a 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -22,7 +22,7 @@
22 22
23#include <asm/asm-offsets.h> 23#include <asm/asm-offsets.h>
24 24
25/* return adress at 0 */ 25/* return address at 0 */
26 26
27#define in_blk 12 /* input byte array address parameter*/ 27#define in_blk 12 /* input byte array address parameter*/
28#define out_blk 8 /* output byte array address parameter*/ 28#define out_blk 8 /* output byte array address parameter*/
@@ -230,8 +230,8 @@ twofish_enc_blk:
230 push %edi 230 push %edi
231 231
232 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ 232 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
233 add $crypto_tfm_ctx_offset, %ebp /* ctx adress */ 233 add $crypto_tfm_ctx_offset, %ebp /* ctx address */
234 mov in_blk+16(%esp),%edi /* input adress in edi */ 234 mov in_blk+16(%esp),%edi /* input address in edi */
235 235
236 mov (%edi), %eax 236 mov (%edi), %eax
237 mov b_offset(%edi), %ebx 237 mov b_offset(%edi), %ebx
@@ -286,8 +286,8 @@ twofish_dec_blk:
286 286
287 287
288 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ 288 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
289 add $crypto_tfm_ctx_offset, %ebp /* ctx adress */ 289 add $crypto_tfm_ctx_offset, %ebp /* ctx address */
290 mov in_blk+16(%esp),%edi /* input adress in edi */ 290 mov in_blk+16(%esp),%edi /* input address in edi */
291 291
292 mov (%edi), %eax 292 mov (%edi), %eax
293 mov b_offset(%edi), %ebx 293 mov b_offset(%edi), %ebx
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index 35974a586615..573aa102542e 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -221,11 +221,11 @@
221twofish_enc_blk: 221twofish_enc_blk:
222 pushq R1 222 pushq R1
223 223
224 /* %rdi contains the crypto tfm adress */ 224 /* %rdi contains the crypto tfm address */
225 /* %rsi contains the output adress */ 225 /* %rsi contains the output address */
226 /* %rdx contains the input adress */ 226 /* %rdx contains the input address */
227 add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ 227 add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
228 /* ctx adress is moved to free one non-rex register 228 /* ctx address is moved to free one non-rex register
229 as target for the 8bit high operations */ 229 as target for the 8bit high operations */
230 mov %rdi, %r11 230 mov %rdi, %r11
231 231
@@ -274,11 +274,11 @@ twofish_enc_blk:
274twofish_dec_blk: 274twofish_dec_blk:
275 pushq R1 275 pushq R1
276 276
277 /* %rdi contains the crypto tfm adress */ 277 /* %rdi contains the crypto tfm address */
278 /* %rsi contains the output adress */ 278 /* %rsi contains the output address */
279 /* %rdx contains the input adress */ 279 /* %rdx contains the input address */
280 add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ 280 add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
281 /* ctx adress is moved to free one non-rex register 281 /* ctx address is moved to free one non-rex register
282 as target for the 8bit high operations */ 282 as target for the 8bit high operations */
283 mov %rdi, %r11 283 mov %rdi, %r11
284 284
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 14531abdd0ce..280c019cfad8 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -297,7 +297,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
297 * size limits imposed on them by creating programs with large 297 * size limits imposed on them by creating programs with large
298 * arrays in the data or bss. 298 * arrays in the data or bss.
299 */ 299 */
300 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 300 rlim = rlimit(RLIMIT_DATA);
301 if (rlim >= RLIM_INFINITY) 301 if (rlim >= RLIM_INFINITY)
302 rlim = ~0; 302 rlim = ~0;
303 if (ex.a_data + ex.a_bss > rlim) 303 if (ex.a_data + ex.a_bss > rlim)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 53147ad85b96..59b4556a5b92 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -563,7 +563,7 @@ ia32_sys_call_table:
563 .quad quiet_ni_syscall /* old mpx syscall holder */ 563 .quad quiet_ni_syscall /* old mpx syscall holder */
564 .quad sys_setpgid 564 .quad sys_setpgid
565 .quad quiet_ni_syscall /* old ulimit syscall holder */ 565 .quad quiet_ni_syscall /* old ulimit syscall holder */
566 .quad sys32_olduname 566 .quad sys_olduname
567 .quad sys_umask /* 60 */ 567 .quad sys_umask /* 60 */
568 .quad sys_chroot 568 .quad sys_chroot
569 .quad compat_sys_ustat 569 .quad compat_sys_ustat
@@ -586,7 +586,7 @@ ia32_sys_call_table:
586 .quad compat_sys_settimeofday 586 .quad compat_sys_settimeofday
587 .quad sys_getgroups16 /* 80 */ 587 .quad sys_getgroups16 /* 80 */
588 .quad sys_setgroups16 588 .quad sys_setgroups16
589 .quad sys32_old_select 589 .quad compat_sys_old_select
590 .quad sys_symlink 590 .quad sys_symlink
591 .quad sys_lstat 591 .quad sys_lstat
592 .quad sys_readlink /* 85 */ 592 .quad sys_readlink /* 85 */
@@ -613,7 +613,7 @@ ia32_sys_call_table:
613 .quad compat_sys_newstat 613 .quad compat_sys_newstat
614 .quad compat_sys_newlstat 614 .quad compat_sys_newlstat
615 .quad compat_sys_newfstat 615 .quad compat_sys_newfstat
616 .quad sys32_uname 616 .quad sys_uname
617 .quad stub32_iopl /* 110 */ 617 .quad stub32_iopl /* 110 */
618 .quad sys_vhangup 618 .quad sys_vhangup
619 .quad quiet_ni_syscall /* old "idle" system call */ 619 .quad quiet_ni_syscall /* old "idle" system call */
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 422572c77923..74c35431b7d8 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -143,7 +143,7 @@ asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
143 * block for parameter passing.. 143 * block for parameter passing..
144 */ 144 */
145 145
146struct mmap_arg_struct { 146struct mmap_arg_struct32 {
147 unsigned int addr; 147 unsigned int addr;
148 unsigned int len; 148 unsigned int len;
149 unsigned int prot; 149 unsigned int prot;
@@ -152,9 +152,9 @@ struct mmap_arg_struct {
152 unsigned int offset; 152 unsigned int offset;
153}; 153};
154 154
155asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) 155asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg)
156{ 156{
157 struct mmap_arg_struct a; 157 struct mmap_arg_struct32 a;
158 158
159 if (copy_from_user(&a, arg, sizeof(a))) 159 if (copy_from_user(&a, arg, sizeof(a)))
160 return -EFAULT; 160 return -EFAULT;
@@ -332,24 +332,6 @@ asmlinkage long sys32_alarm(unsigned int seconds)
332 return alarm_setitimer(seconds); 332 return alarm_setitimer(seconds);
333} 333}
334 334
335struct sel_arg_struct {
336 unsigned int n;
337 unsigned int inp;
338 unsigned int outp;
339 unsigned int exp;
340 unsigned int tvp;
341};
342
343asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg)
344{
345 struct sel_arg_struct a;
346
347 if (copy_from_user(&a, arg, sizeof(a)))
348 return -EFAULT;
349 return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
350 compat_ptr(a.exp), compat_ptr(a.tvp));
351}
352
353asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, 335asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,
354 int options) 336 int options)
355{ 337{
@@ -466,58 +448,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
466 return ret; 448 return ret;
467} 449}
468 450
469asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
470{
471 char *arch = "x86_64";
472 int err;
473
474 if (!name)
475 return -EFAULT;
476 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
477 return -EFAULT;
478
479 down_read(&uts_sem);
480
481 err = __copy_to_user(&name->sysname, &utsname()->sysname,
482 __OLD_UTS_LEN);
483 err |= __put_user(0, name->sysname+__OLD_UTS_LEN);
484 err |= __copy_to_user(&name->nodename, &utsname()->nodename,
485 __OLD_UTS_LEN);
486 err |= __put_user(0, name->nodename+__OLD_UTS_LEN);
487 err |= __copy_to_user(&name->release, &utsname()->release,
488 __OLD_UTS_LEN);
489 err |= __put_user(0, name->release+__OLD_UTS_LEN);
490 err |= __copy_to_user(&name->version, &utsname()->version,
491 __OLD_UTS_LEN);
492 err |= __put_user(0, name->version+__OLD_UTS_LEN);
493
494 if (personality(current->personality) == PER_LINUX32)
495 arch = "i686";
496
497 err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1);
498
499 up_read(&uts_sem);
500
501 err = err ? -EFAULT : 0;
502
503 return err;
504}
505
506long sys32_uname(struct old_utsname __user *name)
507{
508 int err;
509
510 if (!name)
511 return -EFAULT;
512 down_read(&uts_sem);
513 err = copy_to_user(name, utsname(), sizeof(*name));
514 up_read(&uts_sem);
515 if (personality(current->personality) == PER_LINUX32)
516 err |= copy_to_user(&name->machine, "i686", 5);
517
518 return err ? -EFAULT : 0;
519}
520
521asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, 451asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
522 compat_uptr_t __user *envp, struct pt_regs *regs) 452 compat_uptr_t __user *envp, struct pt_regs *regs)
523{ 453{
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 9f828f87ca35..493092efaa3b 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -11,6 +11,7 @@ header-y += sigcontext32.h
11header-y += ucontext.h 11header-y += ucontext.h
12header-y += processor-flags.h 12header-y += processor-flags.h
13header-y += hw_breakpoint.h 13header-y += hw_breakpoint.h
14header-y += hyperv.h
14 15
15unifdef-y += e820.h 16unifdef-y += e820.h
16unifdef-y += ist.h 17unifdef-y += ist.h
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 69b74a7b877f..b09ec55650b3 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -65,12 +65,17 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
65 void *text, void *text_end); 65 void *text, void *text_end);
66extern void alternatives_smp_module_del(struct module *mod); 66extern void alternatives_smp_module_del(struct module *mod);
67extern void alternatives_smp_switch(int smp); 67extern void alternatives_smp_switch(int smp);
68extern int alternatives_text_reserved(void *start, void *end);
68#else 69#else
69static inline void alternatives_smp_module_add(struct module *mod, char *name, 70static inline void alternatives_smp_module_add(struct module *mod, char *name,
70 void *locks, void *locks_end, 71 void *locks, void *locks_end,
71 void *text, void *text_end) {} 72 void *text, void *text_end) {}
72static inline void alternatives_smp_module_del(struct module *mod) {} 73static inline void alternatives_smp_module_del(struct module *mod) {}
73static inline void alternatives_smp_switch(int smp) {} 74static inline void alternatives_smp_switch(int smp) {}
75static inline int alternatives_text_reserved(void *start, void *end)
76{
77 return 0;
78}
74#endif /* CONFIG_SMP */ 79#endif /* CONFIG_SMP */
75 80
76/* alternative assembly primitive: */ 81/* alternative assembly primitive: */
@@ -125,11 +130,16 @@ static inline void alternatives_smp_switch(int smp) {}
125 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ 130 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
126 : output : "i" (0), ## input) 131 : output : "i" (0), ## input)
127 132
133/* Like alternative_io, but for replacing a direct call with another one. */
134#define alternative_call(oldfunc, newfunc, feature, output, input...) \
135 asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
136 : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
137
128/* 138/*
129 * use this macro(s) if you need more than one output parameter 139 * use this macro(s) if you need more than one output parameter
130 * in alternative_io 140 * in alternative_io
131 */ 141 */
132#define ASM_OUTPUT2(a, b) a, b 142#define ASM_OUTPUT2(a...) a
133 143
134struct paravirt_patch_site; 144struct paravirt_patch_site;
135#ifdef CONFIG_PARAVIRT 145#ifdef CONFIG_PARAVIRT
@@ -155,10 +165,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
155 * invalid instruction possible) or if the instructions are changed from a 165 * invalid instruction possible) or if the instructions are changed from a
156 * consistent state to another consistent state atomically. 166 * consistent state to another consistent state atomically.
157 * More care must be taken when modifying code in the SMP case because of 167 * More care must be taken when modifying code in the SMP case because of
158 * Intel's errata. 168 * Intel's errata. text_poke_smp() takes care that errata, but still
169 * doesn't support NMI/MCE handler code modifying.
159 * On the local CPU you need to be protected again NMI or MCE handlers seeing an 170 * On the local CPU you need to be protected again NMI or MCE handlers seeing an
160 * inconsistent instruction while you patch. 171 * inconsistent instruction while you patch.
161 */ 172 */
162extern void *text_poke(void *addr, const void *opcode, size_t len); 173extern void *text_poke(void *addr, const void *opcode, size_t len);
174extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
163 175
164#endif /* _ASM_X86_ALTERNATIVE_H */ 176#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
new file mode 100644
index 000000000000..c74a2eebe570
--- /dev/null
+++ b/arch/x86/include/asm/apb_timer.h
@@ -0,0 +1,70 @@
1/*
2 * apb_timer.h: Driver for Langwell APB timer based on Synopsis DesignWare
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 *
12 * Note:
13 */
14
15#ifndef ASM_X86_APBT_H
16#define ASM_X86_APBT_H
17#include <linux/sfi.h>
18
19#ifdef CONFIG_APB_TIMER
20
21/* Langwell DW APB timer registers */
22#define APBTMR_N_LOAD_COUNT 0x00
23#define APBTMR_N_CURRENT_VALUE 0x04
24#define APBTMR_N_CONTROL 0x08
25#define APBTMR_N_EOI 0x0c
26#define APBTMR_N_INT_STATUS 0x10
27
28#define APBTMRS_INT_STATUS 0xa0
29#define APBTMRS_EOI 0xa4
30#define APBTMRS_RAW_INT_STATUS 0xa8
31#define APBTMRS_COMP_VERSION 0xac
32#define APBTMRS_REG_SIZE 0x14
33
34/* register bits */
35#define APBTMR_CONTROL_ENABLE (1<<0)
36#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /*1: periodic 0:free running */
37#define APBTMR_CONTROL_INT (1<<2)
38
39/* default memory mapped register base */
40#define LNW_SCU_ADDR 0xFF100000
41#define LNW_EXT_TIMER_OFFSET 0x1B800
42#define APBT_DEFAULT_BASE (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET)
43#define LNW_EXT_TIMER_PGOFFSET 0x800
44
45/* APBT clock speed range from PCLK to fabric base, 25-100MHz */
46#define APBT_MAX_FREQ 50
47#define APBT_MIN_FREQ 1
48#define APBT_MMAP_SIZE 1024
49
50#define APBT_DEV_USED 1
51
52extern void apbt_time_init(void);
53extern struct clock_event_device *global_clock_event;
54extern unsigned long apbt_quick_calibrate(void);
55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
56extern void apbt_setup_secondary_clock(void);
57extern unsigned int boot_cpu_id;
58extern int disable_apbt_percpu;
59
60extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
61extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
62extern int sfi_mtimer_num;
63
64#else /* CONFIG_APB_TIMER */
65
66static inline unsigned long apbt_quick_calibrate(void) {return 0; }
67static inline void apbt_time_init(void) {return 0; }
68
69#endif
70#endif /* ASM_X86_APBT_H */
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 4e1b8873c474..8f8217b9bdac 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -1,5 +1,300 @@
1#ifndef _ASM_X86_ATOMIC_H
2#define _ASM_X86_ATOMIC_H
3
4#include <linux/compiler.h>
5#include <linux/types.h>
6#include <asm/processor.h>
7#include <asm/alternative.h>
8#include <asm/cmpxchg.h>
9
10/*
11 * Atomic operations that C can't guarantee us. Useful for
12 * resource counting etc..
13 */
14
15#define ATOMIC_INIT(i) { (i) }
16
17/**
18 * atomic_read - read atomic variable
19 * @v: pointer of type atomic_t
20 *
21 * Atomically reads the value of @v.
22 */
23static inline int atomic_read(const atomic_t *v)
24{
25 return v->counter;
26}
27
28/**
29 * atomic_set - set atomic variable
30 * @v: pointer of type atomic_t
31 * @i: required value
32 *
33 * Atomically sets the value of @v to @i.
34 */
35static inline void atomic_set(atomic_t *v, int i)
36{
37 v->counter = i;
38}
39
40/**
41 * atomic_add - add integer to atomic variable
42 * @i: integer value to add
43 * @v: pointer of type atomic_t
44 *
45 * Atomically adds @i to @v.
46 */
47static inline void atomic_add(int i, atomic_t *v)
48{
49 asm volatile(LOCK_PREFIX "addl %1,%0"
50 : "+m" (v->counter)
51 : "ir" (i));
52}
53
54/**
55 * atomic_sub - subtract integer from atomic variable
56 * @i: integer value to subtract
57 * @v: pointer of type atomic_t
58 *
59 * Atomically subtracts @i from @v.
60 */
61static inline void atomic_sub(int i, atomic_t *v)
62{
63 asm volatile(LOCK_PREFIX "subl %1,%0"
64 : "+m" (v->counter)
65 : "ir" (i));
66}
67
68/**
69 * atomic_sub_and_test - subtract value from variable and test result
70 * @i: integer value to subtract
71 * @v: pointer of type atomic_t
72 *
73 * Atomically subtracts @i from @v and returns
74 * true if the result is zero, or false for all
75 * other cases.
76 */
77static inline int atomic_sub_and_test(int i, atomic_t *v)
78{
79 unsigned char c;
80
81 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
82 : "+m" (v->counter), "=qm" (c)
83 : "ir" (i) : "memory");
84 return c;
85}
86
87/**
88 * atomic_inc - increment atomic variable
89 * @v: pointer of type atomic_t
90 *
91 * Atomically increments @v by 1.
92 */
93static inline void atomic_inc(atomic_t *v)
94{
95 asm volatile(LOCK_PREFIX "incl %0"
96 : "+m" (v->counter));
97}
98
99/**
100 * atomic_dec - decrement atomic variable
101 * @v: pointer of type atomic_t
102 *
103 * Atomically decrements @v by 1.
104 */
105static inline void atomic_dec(atomic_t *v)
106{
107 asm volatile(LOCK_PREFIX "decl %0"
108 : "+m" (v->counter));
109}
110
111/**
112 * atomic_dec_and_test - decrement and test
113 * @v: pointer of type atomic_t
114 *
115 * Atomically decrements @v by 1 and
116 * returns true if the result is 0, or false for all other
117 * cases.
118 */
119static inline int atomic_dec_and_test(atomic_t *v)
120{
121 unsigned char c;
122
123 asm volatile(LOCK_PREFIX "decl %0; sete %1"
124 : "+m" (v->counter), "=qm" (c)
125 : : "memory");
126 return c != 0;
127}
128
129/**
130 * atomic_inc_and_test - increment and test
131 * @v: pointer of type atomic_t
132 *
133 * Atomically increments @v by 1
134 * and returns true if the result is zero, or false for all
135 * other cases.
136 */
137static inline int atomic_inc_and_test(atomic_t *v)
138{
139 unsigned char c;
140
141 asm volatile(LOCK_PREFIX "incl %0; sete %1"
142 : "+m" (v->counter), "=qm" (c)
143 : : "memory");
144 return c != 0;
145}
146
147/**
148 * atomic_add_negative - add and test if negative
149 * @i: integer value to add
150 * @v: pointer of type atomic_t
151 *
152 * Atomically adds @i to @v and returns true
153 * if the result is negative, or false when
154 * result is greater than or equal to zero.
155 */
156static inline int atomic_add_negative(int i, atomic_t *v)
157{
158 unsigned char c;
159
160 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
161 : "+m" (v->counter), "=qm" (c)
162 : "ir" (i) : "memory");
163 return c;
164}
165
166/**
167 * atomic_add_return - add integer and return
168 * @i: integer value to add
169 * @v: pointer of type atomic_t
170 *
171 * Atomically adds @i to @v and returns @i + @v
172 */
173static inline int atomic_add_return(int i, atomic_t *v)
174{
175 int __i;
176#ifdef CONFIG_M386
177 unsigned long flags;
178 if (unlikely(boot_cpu_data.x86 <= 3))
179 goto no_xadd;
180#endif
181 /* Modern 486+ processor */
182 __i = i;
183 asm volatile(LOCK_PREFIX "xaddl %0, %1"
184 : "+r" (i), "+m" (v->counter)
185 : : "memory");
186 return i + __i;
187
188#ifdef CONFIG_M386
189no_xadd: /* Legacy 386 processor */
190 raw_local_irq_save(flags);
191 __i = atomic_read(v);
192 atomic_set(v, i + __i);
193 raw_local_irq_restore(flags);
194 return i + __i;
195#endif
196}
197
198/**
199 * atomic_sub_return - subtract integer and return
200 * @v: pointer of type atomic_t
201 * @i: integer value to subtract
202 *
203 * Atomically subtracts @i from @v and returns @v - @i
204 */
205static inline int atomic_sub_return(int i, atomic_t *v)
206{
207 return atomic_add_return(-i, v);
208}
209
210#define atomic_inc_return(v) (atomic_add_return(1, v))
211#define atomic_dec_return(v) (atomic_sub_return(1, v))
212
213static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
214{
215 return cmpxchg(&v->counter, old, new);
216}
217
218static inline int atomic_xchg(atomic_t *v, int new)
219{
220 return xchg(&v->counter, new);
221}
222
223/**
224 * atomic_add_unless - add unless the number is already a given value
225 * @v: pointer of type atomic_t
226 * @a: the amount to add to v...
227 * @u: ...unless v is equal to u.
228 *
229 * Atomically adds @a to @v, so long as @v was not already @u.
230 * Returns non-zero if @v was not @u, and zero otherwise.
231 */
232static inline int atomic_add_unless(atomic_t *v, int a, int u)
233{
234 int c, old;
235 c = atomic_read(v);
236 for (;;) {
237 if (unlikely(c == (u)))
238 break;
239 old = atomic_cmpxchg((v), c, c + (a));
240 if (likely(old == c))
241 break;
242 c = old;
243 }
244 return c != (u);
245}
246
247#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
248
249/**
250 * atomic_inc_short - increment of a short integer
251 * @v: pointer to type int
252 *
253 * Atomically adds 1 to @v
254 * Returns the new value of @u
255 */
256static inline short int atomic_inc_short(short int *v)
257{
258 asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
259 return *v;
260}
261
262#ifdef CONFIG_X86_64
263/**
264 * atomic_or_long - OR of two long integers
265 * @v1: pointer to type unsigned long
266 * @v2: pointer to type unsigned long
267 *
268 * Atomically ORs @v1 and @v2
269 * Returns the result of the OR
270 */
271static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
272{
273 asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
274}
275#endif
276
277/* These are x86-specific, used by some header files */
278#define atomic_clear_mask(mask, addr) \
279 asm volatile(LOCK_PREFIX "andl %0,%1" \
280 : : "r" (~(mask)), "m" (*(addr)) : "memory")
281
282#define atomic_set_mask(mask, addr) \
283 asm volatile(LOCK_PREFIX "orl %0,%1" \
284 : : "r" ((unsigned)(mask)), "m" (*(addr)) \
285 : "memory")
286
287/* Atomic operations are already serializing on x86 */
288#define smp_mb__before_atomic_dec() barrier()
289#define smp_mb__after_atomic_dec() barrier()
290#define smp_mb__before_atomic_inc() barrier()
291#define smp_mb__after_atomic_inc() barrier()
292
1#ifdef CONFIG_X86_32 293#ifdef CONFIG_X86_32
2# include "atomic_32.h" 294# include "atomic64_32.h"
3#else 295#else
4# include "atomic_64.h" 296# include "atomic64_64.h"
5#endif 297#endif
298
299#include <asm-generic/atomic-long.h>
300#endif /* _ASM_X86_ATOMIC_H */
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
new file mode 100644
index 000000000000..03027bf28de5
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -0,0 +1,160 @@
1#ifndef _ASM_X86_ATOMIC64_32_H
2#define _ASM_X86_ATOMIC64_32_H
3
4#include <linux/compiler.h>
5#include <linux/types.h>
6#include <asm/processor.h>
7//#include <asm/cmpxchg.h>
8
9/* An 64bit atomic type */
10
11typedef struct {
12 u64 __aligned(8) counter;
13} atomic64_t;
14
15#define ATOMIC64_INIT(val) { (val) }
16
17extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
18
19/**
20 * atomic64_xchg - xchg atomic64 variable
21 * @ptr: pointer to type atomic64_t
22 * @new_val: value to assign
23 *
24 * Atomically xchgs the value of @ptr to @new_val and returns
25 * the old value.
26 */
27extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
28
29/**
30 * atomic64_set - set atomic64 variable
31 * @ptr: pointer to type atomic64_t
32 * @new_val: value to assign
33 *
34 * Atomically sets the value of @ptr to @new_val.
35 */
36extern void atomic64_set(atomic64_t *ptr, u64 new_val);
37
38/**
39 * atomic64_read - read atomic64 variable
40 * @ptr: pointer to type atomic64_t
41 *
42 * Atomically reads the value of @ptr and returns it.
43 */
44static inline u64 atomic64_read(atomic64_t *ptr)
45{
46 u64 res;
47
48 /*
49 * Note, we inline this atomic64_t primitive because
50 * it only clobbers EAX/EDX and leaves the others
51 * untouched. We also (somewhat subtly) rely on the
52 * fact that cmpxchg8b returns the current 64-bit value
53 * of the memory location we are touching:
54 */
55 asm volatile(
56 "mov %%ebx, %%eax\n\t"
57 "mov %%ecx, %%edx\n\t"
58 LOCK_PREFIX "cmpxchg8b %1\n"
59 : "=&A" (res)
60 : "m" (*ptr)
61 );
62
63 return res;
64}
65
66extern u64 atomic64_read(atomic64_t *ptr);
67
68/**
69 * atomic64_add_return - add and return
70 * @delta: integer value to add
71 * @ptr: pointer to type atomic64_t
72 *
73 * Atomically adds @delta to @ptr and returns @delta + *@ptr
74 */
75extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
76
77/*
78 * Other variants with different arithmetic operators:
79 */
80extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
81extern u64 atomic64_inc_return(atomic64_t *ptr);
82extern u64 atomic64_dec_return(atomic64_t *ptr);
83
84/**
85 * atomic64_add - add integer to atomic64 variable
86 * @delta: integer value to add
87 * @ptr: pointer to type atomic64_t
88 *
89 * Atomically adds @delta to @ptr.
90 */
91extern void atomic64_add(u64 delta, atomic64_t *ptr);
92
93/**
94 * atomic64_sub - subtract the atomic64 variable
95 * @delta: integer value to subtract
96 * @ptr: pointer to type atomic64_t
97 *
98 * Atomically subtracts @delta from @ptr.
99 */
100extern void atomic64_sub(u64 delta, atomic64_t *ptr);
101
102/**
103 * atomic64_sub_and_test - subtract value from variable and test result
104 * @delta: integer value to subtract
105 * @ptr: pointer to type atomic64_t
106 *
107 * Atomically subtracts @delta from @ptr and returns
108 * true if the result is zero, or false for all
109 * other cases.
110 */
111extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
112
113/**
114 * atomic64_inc - increment atomic64 variable
115 * @ptr: pointer to type atomic64_t
116 *
117 * Atomically increments @ptr by 1.
118 */
119extern void atomic64_inc(atomic64_t *ptr);
120
121/**
122 * atomic64_dec - decrement atomic64 variable
123 * @ptr: pointer to type atomic64_t
124 *
125 * Atomically decrements @ptr by 1.
126 */
127extern void atomic64_dec(atomic64_t *ptr);
128
129/**
130 * atomic64_dec_and_test - decrement and test
131 * @ptr: pointer to type atomic64_t
132 *
133 * Atomically decrements @ptr by 1 and
134 * returns true if the result is 0, or false for all other
135 * cases.
136 */
137extern int atomic64_dec_and_test(atomic64_t *ptr);
138
139/**
140 * atomic64_inc_and_test - increment and test
141 * @ptr: pointer to type atomic64_t
142 *
143 * Atomically increments @ptr by 1
144 * and returns true if the result is zero, or false for all
145 * other cases.
146 */
147extern int atomic64_inc_and_test(atomic64_t *ptr);
148
149/**
150 * atomic64_add_negative - add and test if negative
151 * @delta: integer value to add
152 * @ptr: pointer to type atomic64_t
153 *
154 * Atomically adds @delta to @ptr and returns true
155 * if the result is negative, or false when
156 * result is greater than or equal to zero.
157 */
158extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
159
160#endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
new file mode 100644
index 000000000000..51c5b4056929
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -0,0 +1,224 @@
1#ifndef _ASM_X86_ATOMIC64_64_H
2#define _ASM_X86_ATOMIC64_64_H
3
4#include <linux/types.h>
5#include <asm/alternative.h>
6#include <asm/cmpxchg.h>
7
8/* The 64-bit atomic type */
9
10#define ATOMIC64_INIT(i) { (i) }
11
12/**
13 * atomic64_read - read atomic64 variable
14 * @v: pointer of type atomic64_t
15 *
16 * Atomically reads the value of @v.
17 * Doesn't imply a read memory barrier.
18 */
19static inline long atomic64_read(const atomic64_t *v)
20{
21 return v->counter;
22}
23
24/**
25 * atomic64_set - set atomic64 variable
26 * @v: pointer to type atomic64_t
27 * @i: required value
28 *
29 * Atomically sets the value of @v to @i.
30 */
31static inline void atomic64_set(atomic64_t *v, long i)
32{
33 v->counter = i;
34}
35
36/**
37 * atomic64_add - add integer to atomic64 variable
38 * @i: integer value to add
39 * @v: pointer to type atomic64_t
40 *
41 * Atomically adds @i to @v.
42 */
43static inline void atomic64_add(long i, atomic64_t *v)
44{
45 asm volatile(LOCK_PREFIX "addq %1,%0"
46 : "=m" (v->counter)
47 : "er" (i), "m" (v->counter));
48}
49
50/**
51 * atomic64_sub - subtract the atomic64 variable
52 * @i: integer value to subtract
53 * @v: pointer to type atomic64_t
54 *
55 * Atomically subtracts @i from @v.
56 */
57static inline void atomic64_sub(long i, atomic64_t *v)
58{
59 asm volatile(LOCK_PREFIX "subq %1,%0"
60 : "=m" (v->counter)
61 : "er" (i), "m" (v->counter));
62}
63
64/**
65 * atomic64_sub_and_test - subtract value from variable and test result
66 * @i: integer value to subtract
67 * @v: pointer to type atomic64_t
68 *
69 * Atomically subtracts @i from @v and returns
70 * true if the result is zero, or false for all
71 * other cases.
72 */
73static inline int atomic64_sub_and_test(long i, atomic64_t *v)
74{
75 unsigned char c;
76
77 asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
78 : "=m" (v->counter), "=qm" (c)
79 : "er" (i), "m" (v->counter) : "memory");
80 return c;
81}
82
83/**
84 * atomic64_inc - increment atomic64 variable
85 * @v: pointer to type atomic64_t
86 *
87 * Atomically increments @v by 1.
88 */
89static inline void atomic64_inc(atomic64_t *v)
90{
91 asm volatile(LOCK_PREFIX "incq %0"
92 : "=m" (v->counter)
93 : "m" (v->counter));
94}
95
96/**
97 * atomic64_dec - decrement atomic64 variable
98 * @v: pointer to type atomic64_t
99 *
100 * Atomically decrements @v by 1.
101 */
102static inline void atomic64_dec(atomic64_t *v)
103{
104 asm volatile(LOCK_PREFIX "decq %0"
105 : "=m" (v->counter)
106 : "m" (v->counter));
107}
108
109/**
110 * atomic64_dec_and_test - decrement and test
111 * @v: pointer to type atomic64_t
112 *
113 * Atomically decrements @v by 1 and
114 * returns true if the result is 0, or false for all other
115 * cases.
116 */
117static inline int atomic64_dec_and_test(atomic64_t *v)
118{
119 unsigned char c;
120
121 asm volatile(LOCK_PREFIX "decq %0; sete %1"
122 : "=m" (v->counter), "=qm" (c)
123 : "m" (v->counter) : "memory");
124 return c != 0;
125}
126
127/**
128 * atomic64_inc_and_test - increment and test
129 * @v: pointer to type atomic64_t
130 *
131 * Atomically increments @v by 1
132 * and returns true if the result is zero, or false for all
133 * other cases.
134 */
135static inline int atomic64_inc_and_test(atomic64_t *v)
136{
137 unsigned char c;
138
139 asm volatile(LOCK_PREFIX "incq %0; sete %1"
140 : "=m" (v->counter), "=qm" (c)
141 : "m" (v->counter) : "memory");
142 return c != 0;
143}
144
145/**
146 * atomic64_add_negative - add and test if negative
147 * @i: integer value to add
148 * @v: pointer to type atomic64_t
149 *
150 * Atomically adds @i to @v and returns true
151 * if the result is negative, or false when
152 * result is greater than or equal to zero.
153 */
154static inline int atomic64_add_negative(long i, atomic64_t *v)
155{
156 unsigned char c;
157
158 asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
159 : "=m" (v->counter), "=qm" (c)
160 : "er" (i), "m" (v->counter) : "memory");
161 return c;
162}
163
164/**
165 * atomic64_add_return - add and return
166 * @i: integer value to add
167 * @v: pointer to type atomic64_t
168 *
169 * Atomically adds @i to @v and returns @i + @v
170 */
171static inline long atomic64_add_return(long i, atomic64_t *v)
172{
173 long __i = i;
174 asm volatile(LOCK_PREFIX "xaddq %0, %1;"
175 : "+r" (i), "+m" (v->counter)
176 : : "memory");
177 return i + __i;
178}
179
180static inline long atomic64_sub_return(long i, atomic64_t *v)
181{
182 return atomic64_add_return(-i, v);
183}
184
185#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
186#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
187
188static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
189{
190 return cmpxchg(&v->counter, old, new);
191}
192
193static inline long atomic64_xchg(atomic64_t *v, long new)
194{
195 return xchg(&v->counter, new);
196}
197
198/**
199 * atomic64_add_unless - add unless the number is a given value
200 * @v: pointer of type atomic64_t
201 * @a: the amount to add to v...
202 * @u: ...unless v is equal to u.
203 *
204 * Atomically adds @a to @v, so long as it was not @u.
205 * Returns non-zero if @v was not @u, and zero otherwise.
206 */
207static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
208{
209 long c, old;
210 c = atomic64_read(v);
211 for (;;) {
212 if (unlikely(c == (u)))
213 break;
214 old = atomic64_cmpxchg((v), c, c + (a));
215 if (likely(old == c))
216 break;
217 c = old;
218 }
219 return c != (u);
220}
221
222#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
223
224#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
deleted file mode 100644
index dc5a667ff791..000000000000
--- a/arch/x86/include/asm/atomic_32.h
+++ /dev/null
@@ -1,415 +0,0 @@
1#ifndef _ASM_X86_ATOMIC_32_H
2#define _ASM_X86_ATOMIC_32_H
3
4#include <linux/compiler.h>
5#include <linux/types.h>
6#include <asm/processor.h>
7#include <asm/cmpxchg.h>
8
9/*
10 * Atomic operations that C can't guarantee us. Useful for
11 * resource counting etc..
12 */
13
14#define ATOMIC_INIT(i) { (i) }
15
16/**
17 * atomic_read - read atomic variable
18 * @v: pointer of type atomic_t
19 *
20 * Atomically reads the value of @v.
21 */
22static inline int atomic_read(const atomic_t *v)
23{
24 return v->counter;
25}
26
27/**
28 * atomic_set - set atomic variable
29 * @v: pointer of type atomic_t
30 * @i: required value
31 *
32 * Atomically sets the value of @v to @i.
33 */
34static inline void atomic_set(atomic_t *v, int i)
35{
36 v->counter = i;
37}
38
39/**
40 * atomic_add - add integer to atomic variable
41 * @i: integer value to add
42 * @v: pointer of type atomic_t
43 *
44 * Atomically adds @i to @v.
45 */
46static inline void atomic_add(int i, atomic_t *v)
47{
48 asm volatile(LOCK_PREFIX "addl %1,%0"
49 : "+m" (v->counter)
50 : "ir" (i));
51}
52
53/**
54 * atomic_sub - subtract integer from atomic variable
55 * @i: integer value to subtract
56 * @v: pointer of type atomic_t
57 *
58 * Atomically subtracts @i from @v.
59 */
60static inline void atomic_sub(int i, atomic_t *v)
61{
62 asm volatile(LOCK_PREFIX "subl %1,%0"
63 : "+m" (v->counter)
64 : "ir" (i));
65}
66
67/**
68 * atomic_sub_and_test - subtract value from variable and test result
69 * @i: integer value to subtract
70 * @v: pointer of type atomic_t
71 *
72 * Atomically subtracts @i from @v and returns
73 * true if the result is zero, or false for all
74 * other cases.
75 */
76static inline int atomic_sub_and_test(int i, atomic_t *v)
77{
78 unsigned char c;
79
80 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
81 : "+m" (v->counter), "=qm" (c)
82 : "ir" (i) : "memory");
83 return c;
84}
85
86/**
87 * atomic_inc - increment atomic variable
88 * @v: pointer of type atomic_t
89 *
90 * Atomically increments @v by 1.
91 */
92static inline void atomic_inc(atomic_t *v)
93{
94 asm volatile(LOCK_PREFIX "incl %0"
95 : "+m" (v->counter));
96}
97
98/**
99 * atomic_dec - decrement atomic variable
100 * @v: pointer of type atomic_t
101 *
102 * Atomically decrements @v by 1.
103 */
104static inline void atomic_dec(atomic_t *v)
105{
106 asm volatile(LOCK_PREFIX "decl %0"
107 : "+m" (v->counter));
108}
109
110/**
111 * atomic_dec_and_test - decrement and test
112 * @v: pointer of type atomic_t
113 *
114 * Atomically decrements @v by 1 and
115 * returns true if the result is 0, or false for all other
116 * cases.
117 */
118static inline int atomic_dec_and_test(atomic_t *v)
119{
120 unsigned char c;
121
122 asm volatile(LOCK_PREFIX "decl %0; sete %1"
123 : "+m" (v->counter), "=qm" (c)
124 : : "memory");
125 return c != 0;
126}
127
128/**
129 * atomic_inc_and_test - increment and test
130 * @v: pointer of type atomic_t
131 *
132 * Atomically increments @v by 1
133 * and returns true if the result is zero, or false for all
134 * other cases.
135 */
136static inline int atomic_inc_and_test(atomic_t *v)
137{
138 unsigned char c;
139
140 asm volatile(LOCK_PREFIX "incl %0; sete %1"
141 : "+m" (v->counter), "=qm" (c)
142 : : "memory");
143 return c != 0;
144}
145
146/**
147 * atomic_add_negative - add and test if negative
148 * @v: pointer of type atomic_t
149 * @i: integer value to add
150 *
151 * Atomically adds @i to @v and returns true
152 * if the result is negative, or false when
153 * result is greater than or equal to zero.
154 */
155static inline int atomic_add_negative(int i, atomic_t *v)
156{
157 unsigned char c;
158
159 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
160 : "+m" (v->counter), "=qm" (c)
161 : "ir" (i) : "memory");
162 return c;
163}
164
165/**
166 * atomic_add_return - add integer and return
167 * @v: pointer of type atomic_t
168 * @i: integer value to add
169 *
170 * Atomically adds @i to @v and returns @i + @v
171 */
172static inline int atomic_add_return(int i, atomic_t *v)
173{
174 int __i;
175#ifdef CONFIG_M386
176 unsigned long flags;
177 if (unlikely(boot_cpu_data.x86 <= 3))
178 goto no_xadd;
179#endif
180 /* Modern 486+ processor */
181 __i = i;
182 asm volatile(LOCK_PREFIX "xaddl %0, %1"
183 : "+r" (i), "+m" (v->counter)
184 : : "memory");
185 return i + __i;
186
187#ifdef CONFIG_M386
188no_xadd: /* Legacy 386 processor */
189 local_irq_save(flags);
190 __i = atomic_read(v);
191 atomic_set(v, i + __i);
192 local_irq_restore(flags);
193 return i + __i;
194#endif
195}
196
197/**
198 * atomic_sub_return - subtract integer and return
199 * @v: pointer of type atomic_t
200 * @i: integer value to subtract
201 *
202 * Atomically subtracts @i from @v and returns @v - @i
203 */
204static inline int atomic_sub_return(int i, atomic_t *v)
205{
206 return atomic_add_return(-i, v);
207}
208
209static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
210{
211 return cmpxchg(&v->counter, old, new);
212}
213
214static inline int atomic_xchg(atomic_t *v, int new)
215{
216 return xchg(&v->counter, new);
217}
218
219/**
220 * atomic_add_unless - add unless the number is already a given value
221 * @v: pointer of type atomic_t
222 * @a: the amount to add to v...
223 * @u: ...unless v is equal to u.
224 *
225 * Atomically adds @a to @v, so long as @v was not already @u.
226 * Returns non-zero if @v was not @u, and zero otherwise.
227 */
228static inline int atomic_add_unless(atomic_t *v, int a, int u)
229{
230 int c, old;
231 c = atomic_read(v);
232 for (;;) {
233 if (unlikely(c == (u)))
234 break;
235 old = atomic_cmpxchg((v), c, c + (a));
236 if (likely(old == c))
237 break;
238 c = old;
239 }
240 return c != (u);
241}
242
243#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
244
245#define atomic_inc_return(v) (atomic_add_return(1, v))
246#define atomic_dec_return(v) (atomic_sub_return(1, v))
247
248/* These are x86-specific, used by some header files */
249#define atomic_clear_mask(mask, addr) \
250 asm volatile(LOCK_PREFIX "andl %0,%1" \
251 : : "r" (~(mask)), "m" (*(addr)) : "memory")
252
253#define atomic_set_mask(mask, addr) \
254 asm volatile(LOCK_PREFIX "orl %0,%1" \
255 : : "r" (mask), "m" (*(addr)) : "memory")
256
257/* Atomic operations are already serializing on x86 */
258#define smp_mb__before_atomic_dec() barrier()
259#define smp_mb__after_atomic_dec() barrier()
260#define smp_mb__before_atomic_inc() barrier()
261#define smp_mb__after_atomic_inc() barrier()
262
263/* An 64bit atomic type */
264
265typedef struct {
266 u64 __aligned(8) counter;
267} atomic64_t;
268
269#define ATOMIC64_INIT(val) { (val) }
270
271extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
272
273/**
274 * atomic64_xchg - xchg atomic64 variable
275 * @ptr: pointer to type atomic64_t
276 * @new_val: value to assign
277 *
278 * Atomically xchgs the value of @ptr to @new_val and returns
279 * the old value.
280 */
281extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
282
283/**
284 * atomic64_set - set atomic64 variable
285 * @ptr: pointer to type atomic64_t
286 * @new_val: value to assign
287 *
288 * Atomically sets the value of @ptr to @new_val.
289 */
290extern void atomic64_set(atomic64_t *ptr, u64 new_val);
291
292/**
293 * atomic64_read - read atomic64 variable
294 * @ptr: pointer to type atomic64_t
295 *
296 * Atomically reads the value of @ptr and returns it.
297 */
298static inline u64 atomic64_read(atomic64_t *ptr)
299{
300 u64 res;
301
302 /*
303 * Note, we inline this atomic64_t primitive because
304 * it only clobbers EAX/EDX and leaves the others
305 * untouched. We also (somewhat subtly) rely on the
306 * fact that cmpxchg8b returns the current 64-bit value
307 * of the memory location we are touching:
308 */
309 asm volatile(
310 "mov %%ebx, %%eax\n\t"
311 "mov %%ecx, %%edx\n\t"
312 LOCK_PREFIX "cmpxchg8b %1\n"
313 : "=&A" (res)
314 : "m" (*ptr)
315 );
316
317 return res;
318}
319
320extern u64 atomic64_read(atomic64_t *ptr);
321
322/**
323 * atomic64_add_return - add and return
324 * @delta: integer value to add
325 * @ptr: pointer to type atomic64_t
326 *
327 * Atomically adds @delta to @ptr and returns @delta + *@ptr
328 */
329extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
330
331/*
332 * Other variants with different arithmetic operators:
333 */
334extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
335extern u64 atomic64_inc_return(atomic64_t *ptr);
336extern u64 atomic64_dec_return(atomic64_t *ptr);
337
338/**
339 * atomic64_add - add integer to atomic64 variable
340 * @delta: integer value to add
341 * @ptr: pointer to type atomic64_t
342 *
343 * Atomically adds @delta to @ptr.
344 */
345extern void atomic64_add(u64 delta, atomic64_t *ptr);
346
347/**
348 * atomic64_sub - subtract the atomic64 variable
349 * @delta: integer value to subtract
350 * @ptr: pointer to type atomic64_t
351 *
352 * Atomically subtracts @delta from @ptr.
353 */
354extern void atomic64_sub(u64 delta, atomic64_t *ptr);
355
356/**
357 * atomic64_sub_and_test - subtract value from variable and test result
358 * @delta: integer value to subtract
359 * @ptr: pointer to type atomic64_t
360 *
361 * Atomically subtracts @delta from @ptr and returns
362 * true if the result is zero, or false for all
363 * other cases.
364 */
365extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
366
367/**
368 * atomic64_inc - increment atomic64 variable
369 * @ptr: pointer to type atomic64_t
370 *
371 * Atomically increments @ptr by 1.
372 */
373extern void atomic64_inc(atomic64_t *ptr);
374
375/**
376 * atomic64_dec - decrement atomic64 variable
377 * @ptr: pointer to type atomic64_t
378 *
379 * Atomically decrements @ptr by 1.
380 */
381extern void atomic64_dec(atomic64_t *ptr);
382
383/**
384 * atomic64_dec_and_test - decrement and test
385 * @ptr: pointer to type atomic64_t
386 *
387 * Atomically decrements @ptr by 1 and
388 * returns true if the result is 0, or false for all other
389 * cases.
390 */
391extern int atomic64_dec_and_test(atomic64_t *ptr);
392
393/**
394 * atomic64_inc_and_test - increment and test
395 * @ptr: pointer to type atomic64_t
396 *
397 * Atomically increments @ptr by 1
398 * and returns true if the result is zero, or false for all
399 * other cases.
400 */
401extern int atomic64_inc_and_test(atomic64_t *ptr);
402
403/**
404 * atomic64_add_negative - add and test if negative
405 * @delta: integer value to add
406 * @ptr: pointer to type atomic64_t
407 *
408 * Atomically adds @delta to @ptr and returns true
409 * if the result is negative, or false when
410 * result is greater than or equal to zero.
411 */
412extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
413
414#include <asm-generic/atomic-long.h>
415#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
deleted file mode 100644
index d605dc268e79..000000000000
--- a/arch/x86/include/asm/atomic_64.h
+++ /dev/null
@@ -1,485 +0,0 @@
1#ifndef _ASM_X86_ATOMIC_64_H
2#define _ASM_X86_ATOMIC_64_H
3
4#include <linux/types.h>
5#include <asm/alternative.h>
6#include <asm/cmpxchg.h>
7
8/*
9 * Atomic operations that C can't guarantee us. Useful for
10 * resource counting etc..
11 */
12
13#define ATOMIC_INIT(i) { (i) }
14
15/**
16 * atomic_read - read atomic variable
17 * @v: pointer of type atomic_t
18 *
19 * Atomically reads the value of @v.
20 */
21static inline int atomic_read(const atomic_t *v)
22{
23 return v->counter;
24}
25
26/**
27 * atomic_set - set atomic variable
28 * @v: pointer of type atomic_t
29 * @i: required value
30 *
31 * Atomically sets the value of @v to @i.
32 */
33static inline void atomic_set(atomic_t *v, int i)
34{
35 v->counter = i;
36}
37
38/**
39 * atomic_add - add integer to atomic variable
40 * @i: integer value to add
41 * @v: pointer of type atomic_t
42 *
43 * Atomically adds @i to @v.
44 */
45static inline void atomic_add(int i, atomic_t *v)
46{
47 asm volatile(LOCK_PREFIX "addl %1,%0"
48 : "=m" (v->counter)
49 : "ir" (i), "m" (v->counter));
50}
51
52/**
53 * atomic_sub - subtract the atomic variable
54 * @i: integer value to subtract
55 * @v: pointer of type atomic_t
56 *
57 * Atomically subtracts @i from @v.
58 */
59static inline void atomic_sub(int i, atomic_t *v)
60{
61 asm volatile(LOCK_PREFIX "subl %1,%0"
62 : "=m" (v->counter)
63 : "ir" (i), "m" (v->counter));
64}
65
66/**
67 * atomic_sub_and_test - subtract value from variable and test result
68 * @i: integer value to subtract
69 * @v: pointer of type atomic_t
70 *
71 * Atomically subtracts @i from @v and returns
72 * true if the result is zero, or false for all
73 * other cases.
74 */
75static inline int atomic_sub_and_test(int i, atomic_t *v)
76{
77 unsigned char c;
78
79 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
80 : "=m" (v->counter), "=qm" (c)
81 : "ir" (i), "m" (v->counter) : "memory");
82 return c;
83}
84
85/**
86 * atomic_inc - increment atomic variable
87 * @v: pointer of type atomic_t
88 *
89 * Atomically increments @v by 1.
90 */
91static inline void atomic_inc(atomic_t *v)
92{
93 asm volatile(LOCK_PREFIX "incl %0"
94 : "=m" (v->counter)
95 : "m" (v->counter));
96}
97
98/**
99 * atomic_dec - decrement atomic variable
100 * @v: pointer of type atomic_t
101 *
102 * Atomically decrements @v by 1.
103 */
104static inline void atomic_dec(atomic_t *v)
105{
106 asm volatile(LOCK_PREFIX "decl %0"
107 : "=m" (v->counter)
108 : "m" (v->counter));
109}
110
111/**
112 * atomic_dec_and_test - decrement and test
113 * @v: pointer of type atomic_t
114 *
115 * Atomically decrements @v by 1 and
116 * returns true if the result is 0, or false for all other
117 * cases.
118 */
119static inline int atomic_dec_and_test(atomic_t *v)
120{
121 unsigned char c;
122
123 asm volatile(LOCK_PREFIX "decl %0; sete %1"
124 : "=m" (v->counter), "=qm" (c)
125 : "m" (v->counter) : "memory");
126 return c != 0;
127}
128
129/**
130 * atomic_inc_and_test - increment and test
131 * @v: pointer of type atomic_t
132 *
133 * Atomically increments @v by 1
134 * and returns true if the result is zero, or false for all
135 * other cases.
136 */
137static inline int atomic_inc_and_test(atomic_t *v)
138{
139 unsigned char c;
140
141 asm volatile(LOCK_PREFIX "incl %0; sete %1"
142 : "=m" (v->counter), "=qm" (c)
143 : "m" (v->counter) : "memory");
144 return c != 0;
145}
146
147/**
148 * atomic_add_negative - add and test if negative
149 * @i: integer value to add
150 * @v: pointer of type atomic_t
151 *
152 * Atomically adds @i to @v and returns true
153 * if the result is negative, or false when
154 * result is greater than or equal to zero.
155 */
156static inline int atomic_add_negative(int i, atomic_t *v)
157{
158 unsigned char c;
159
160 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
161 : "=m" (v->counter), "=qm" (c)
162 : "ir" (i), "m" (v->counter) : "memory");
163 return c;
164}
165
166/**
167 * atomic_add_return - add and return
168 * @i: integer value to add
169 * @v: pointer of type atomic_t
170 *
171 * Atomically adds @i to @v and returns @i + @v
172 */
173static inline int atomic_add_return(int i, atomic_t *v)
174{
175 int __i = i;
176 asm volatile(LOCK_PREFIX "xaddl %0, %1"
177 : "+r" (i), "+m" (v->counter)
178 : : "memory");
179 return i + __i;
180}
181
182static inline int atomic_sub_return(int i, atomic_t *v)
183{
184 return atomic_add_return(-i, v);
185}
186
187#define atomic_inc_return(v) (atomic_add_return(1, v))
188#define atomic_dec_return(v) (atomic_sub_return(1, v))
189
190/* The 64-bit atomic type */
191
192#define ATOMIC64_INIT(i) { (i) }
193
194/**
195 * atomic64_read - read atomic64 variable
196 * @v: pointer of type atomic64_t
197 *
198 * Atomically reads the value of @v.
199 * Doesn't imply a read memory barrier.
200 */
201static inline long atomic64_read(const atomic64_t *v)
202{
203 return v->counter;
204}
205
206/**
207 * atomic64_set - set atomic64 variable
208 * @v: pointer to type atomic64_t
209 * @i: required value
210 *
211 * Atomically sets the value of @v to @i.
212 */
213static inline void atomic64_set(atomic64_t *v, long i)
214{
215 v->counter = i;
216}
217
218/**
219 * atomic64_add - add integer to atomic64 variable
220 * @i: integer value to add
221 * @v: pointer to type atomic64_t
222 *
223 * Atomically adds @i to @v.
224 */
225static inline void atomic64_add(long i, atomic64_t *v)
226{
227 asm volatile(LOCK_PREFIX "addq %1,%0"
228 : "=m" (v->counter)
229 : "er" (i), "m" (v->counter));
230}
231
232/**
233 * atomic64_sub - subtract the atomic64 variable
234 * @i: integer value to subtract
235 * @v: pointer to type atomic64_t
236 *
237 * Atomically subtracts @i from @v.
238 */
239static inline void atomic64_sub(long i, atomic64_t *v)
240{
241 asm volatile(LOCK_PREFIX "subq %1,%0"
242 : "=m" (v->counter)
243 : "er" (i), "m" (v->counter));
244}
245
246/**
247 * atomic64_sub_and_test - subtract value from variable and test result
248 * @i: integer value to subtract
249 * @v: pointer to type atomic64_t
250 *
251 * Atomically subtracts @i from @v and returns
252 * true if the result is zero, or false for all
253 * other cases.
254 */
255static inline int atomic64_sub_and_test(long i, atomic64_t *v)
256{
257 unsigned char c;
258
259 asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
260 : "=m" (v->counter), "=qm" (c)
261 : "er" (i), "m" (v->counter) : "memory");
262 return c;
263}
264
265/**
266 * atomic64_inc - increment atomic64 variable
267 * @v: pointer to type atomic64_t
268 *
269 * Atomically increments @v by 1.
270 */
271static inline void atomic64_inc(atomic64_t *v)
272{
273 asm volatile(LOCK_PREFIX "incq %0"
274 : "=m" (v->counter)
275 : "m" (v->counter));
276}
277
278/**
279 * atomic64_dec - decrement atomic64 variable
280 * @v: pointer to type atomic64_t
281 *
282 * Atomically decrements @v by 1.
283 */
284static inline void atomic64_dec(atomic64_t *v)
285{
286 asm volatile(LOCK_PREFIX "decq %0"
287 : "=m" (v->counter)
288 : "m" (v->counter));
289}
290
291/**
292 * atomic64_dec_and_test - decrement and test
293 * @v: pointer to type atomic64_t
294 *
295 * Atomically decrements @v by 1 and
296 * returns true if the result is 0, or false for all other
297 * cases.
298 */
299static inline int atomic64_dec_and_test(atomic64_t *v)
300{
301 unsigned char c;
302
303 asm volatile(LOCK_PREFIX "decq %0; sete %1"
304 : "=m" (v->counter), "=qm" (c)
305 : "m" (v->counter) : "memory");
306 return c != 0;
307}
308
309/**
310 * atomic64_inc_and_test - increment and test
311 * @v: pointer to type atomic64_t
312 *
313 * Atomically increments @v by 1
314 * and returns true if the result is zero, or false for all
315 * other cases.
316 */
317static inline int atomic64_inc_and_test(atomic64_t *v)
318{
319 unsigned char c;
320
321 asm volatile(LOCK_PREFIX "incq %0; sete %1"
322 : "=m" (v->counter), "=qm" (c)
323 : "m" (v->counter) : "memory");
324 return c != 0;
325}
326
327/**
328 * atomic64_add_negative - add and test if negative
329 * @i: integer value to add
330 * @v: pointer to type atomic64_t
331 *
332 * Atomically adds @i to @v and returns true
333 * if the result is negative, or false when
334 * result is greater than or equal to zero.
335 */
336static inline int atomic64_add_negative(long i, atomic64_t *v)
337{
338 unsigned char c;
339
340 asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
341 : "=m" (v->counter), "=qm" (c)
342 : "er" (i), "m" (v->counter) : "memory");
343 return c;
344}
345
346/**
347 * atomic64_add_return - add and return
348 * @i: integer value to add
349 * @v: pointer to type atomic64_t
350 *
351 * Atomically adds @i to @v and returns @i + @v
352 */
353static inline long atomic64_add_return(long i, atomic64_t *v)
354{
355 long __i = i;
356 asm volatile(LOCK_PREFIX "xaddq %0, %1;"
357 : "+r" (i), "+m" (v->counter)
358 : : "memory");
359 return i + __i;
360}
361
362static inline long atomic64_sub_return(long i, atomic64_t *v)
363{
364 return atomic64_add_return(-i, v);
365}
366
367#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
368#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
369
370static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
371{
372 return cmpxchg(&v->counter, old, new);
373}
374
375static inline long atomic64_xchg(atomic64_t *v, long new)
376{
377 return xchg(&v->counter, new);
378}
379
380static inline long atomic_cmpxchg(atomic_t *v, int old, int new)
381{
382 return cmpxchg(&v->counter, old, new);
383}
384
385static inline long atomic_xchg(atomic_t *v, int new)
386{
387 return xchg(&v->counter, new);
388}
389
390/**
391 * atomic_add_unless - add unless the number is a given value
392 * @v: pointer of type atomic_t
393 * @a: the amount to add to v...
394 * @u: ...unless v is equal to u.
395 *
396 * Atomically adds @a to @v, so long as it was not @u.
397 * Returns non-zero if @v was not @u, and zero otherwise.
398 */
399static inline int atomic_add_unless(atomic_t *v, int a, int u)
400{
401 int c, old;
402 c = atomic_read(v);
403 for (;;) {
404 if (unlikely(c == (u)))
405 break;
406 old = atomic_cmpxchg((v), c, c + (a));
407 if (likely(old == c))
408 break;
409 c = old;
410 }
411 return c != (u);
412}
413
414#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
415
416/**
417 * atomic64_add_unless - add unless the number is a given value
418 * @v: pointer of type atomic64_t
419 * @a: the amount to add to v...
420 * @u: ...unless v is equal to u.
421 *
422 * Atomically adds @a to @v, so long as it was not @u.
423 * Returns non-zero if @v was not @u, and zero otherwise.
424 */
425static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
426{
427 long c, old;
428 c = atomic64_read(v);
429 for (;;) {
430 if (unlikely(c == (u)))
431 break;
432 old = atomic64_cmpxchg((v), c, c + (a));
433 if (likely(old == c))
434 break;
435 c = old;
436 }
437 return c != (u);
438}
439
440/**
441 * atomic_inc_short - increment of a short integer
442 * @v: pointer to type int
443 *
444 * Atomically adds 1 to @v
445 * Returns the new value of @u
446 */
447static inline short int atomic_inc_short(short int *v)
448{
449 asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
450 return *v;
451}
452
453/**
454 * atomic_or_long - OR of two long integers
455 * @v1: pointer to type unsigned long
456 * @v2: pointer to type unsigned long
457 *
458 * Atomically ORs @v1 and @v2
459 * Returns the result of the OR
460 */
461static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
462{
463 asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
464}
465
466#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
467
468/* These are x86-specific, used by some header files */
469#define atomic_clear_mask(mask, addr) \
470 asm volatile(LOCK_PREFIX "andl %0,%1" \
471 : : "r" (~(mask)), "m" (*(addr)) : "memory")
472
473#define atomic_set_mask(mask, addr) \
474 asm volatile(LOCK_PREFIX "orl %0,%1" \
475 : : "r" ((unsigned)(mask)), "m" (*(addr)) \
476 : "memory")
477
478/* Atomic operations are already serializing on x86 */
479#define smp_mb__before_atomic_dec() barrier()
480#define smp_mb__after_atomic_dec() barrier()
481#define smp_mb__before_atomic_inc() barrier()
482#define smp_mb__after_atomic_inc() barrier()
483
484#include <asm-generic/atomic-long.h>
485#endif /* _ASM_X86_ATOMIC_64_H */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 9a9c7bdc923d..306160e58b48 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -8,7 +8,8 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <asm/user32.h> 9#include <asm/user32.h>
10 10
11#define COMPAT_USER_HZ 100 11#define COMPAT_USER_HZ 100
12#define COMPAT_UTS_MACHINE "i686\0\0"
12 13
13typedef u32 compat_size_t; 14typedef u32 compat_size_t;
14typedef s32 compat_ssize_t; 15typedef s32 compat_ssize_t;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 637e1ec963c3..0cd82d068613 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -168,6 +168,10 @@
168#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ 168#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
169#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ 169#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
170#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ 170#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
171#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */
172#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */
173#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */
174#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */
171 175
172#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 176#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
173 177
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 8240f76b531e..b81002f23614 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -14,6 +14,9 @@
14 which debugging register was responsible for the trap. The other bits 14 which debugging register was responsible for the trap. The other bits
15 are either reserved or not of interest to us. */ 15 are either reserved or not of interest to us. */
16 16
17/* Define reserved bits in DR6 which are always set to 1 */
18#define DR6_RESERVED (0xFFFF0FF0)
19
17#define DR_TRAP0 (0x1) /* db0 */ 20#define DR_TRAP0 (0x1) /* db0 */
18#define DR_TRAP1 (0x2) /* db1 */ 21#define DR_TRAP1 (0x2) /* db1 */
19#define DR_TRAP2 (0x4) /* db2 */ 22#define DR_TRAP2 (0x4) /* db2 */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 761249e396fe..0e22296790d3 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -111,11 +111,8 @@ extern unsigned long end_user_pfn;
111 111
112extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); 112extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
113extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); 113extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
114extern void reserve_early(u64 start, u64 end, char *name);
115extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
116extern void free_early(u64 start, u64 end);
117extern void early_res_to_bootmem(u64 start, u64 end);
118extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); 114extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
115#include <linux/early_res.h>
119 116
120extern unsigned long e820_end_of_ram_pfn(void); 117extern unsigned long e820_end_of_ram_pfn(void);
121extern unsigned long e820_end_of_low_ram_pfn(void); 118extern unsigned long e820_end_of_low_ram_pfn(void);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1994d3f58443..f2ad2163109d 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -170,10 +170,7 @@ static inline void elf_common_init(struct thread_struct *t,
170} 170}
171 171
172#define ELF_PLAT_INIT(_r, load_addr) \ 172#define ELF_PLAT_INIT(_r, load_addr) \
173do { \ 173 elf_common_init(&current->thread, _r, 0)
174 elf_common_init(&current->thread, _r, 0); \
175 clear_thread_flag(TIF_IA32); \
176} while (0)
177 174
178#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ 175#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
179 elf_common_init(&current->thread, regs, __USER_DS) 176 elf_common_init(&current->thread, regs, __USER_DS)
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
index 53018464aea6..2519d0679d99 100644
--- a/arch/x86/include/asm/fb.h
+++ b/arch/x86/include/asm/fb.h
@@ -12,10 +12,6 @@ static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
12 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; 12 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
13} 13}
14 14
15#ifdef CONFIG_X86_32
16extern int fb_is_primary_device(struct fb_info *info); 15extern int fb_is_primary_device(struct fb_info *info);
17#else
18static inline int fb_is_primary_device(struct fb_info *info) { return 0; }
19#endif
20 16
21#endif /* _ASM_X86_FB_H */ 17#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 14f9890eb495..635f03bb4995 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -118,14 +118,20 @@ enum fixed_addresses {
118 * 256 temporary boot-time mappings, used by early_ioremap(), 118 * 256 temporary boot-time mappings, used by early_ioremap(),
119 * before ioremap() is functional. 119 * before ioremap() is functional.
120 * 120 *
121 * We round it up to the next 256 pages boundary so that we 121 * If necessary we round it up to the next 256 pages boundary so
122 * can have a single pgd entry and a single pte table: 122 * that we can have a single pgd entry and a single pte table:
123 */ 123 */
124#define NR_FIX_BTMAPS 64 124#define NR_FIX_BTMAPS 64
125#define FIX_BTMAPS_SLOTS 4 125#define FIX_BTMAPS_SLOTS 4
126 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - 126#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
127 (__end_of_permanent_fixed_addresses & 255), 127 FIX_BTMAP_END =
128 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, 128 (__end_of_permanent_fixed_addresses ^
129 (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
130 -PTRS_PER_PTE
131 ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
132 (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
133 : __end_of_permanent_fixed_addresses,
134 FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
129#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT 135#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
130 FIX_OHCI1394_BASE, 136 FIX_OHCI1394_BASE,
131#endif 137#endif
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 014c2b85ae45..a726650fc80f 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
67struct page *kmap_atomic_to_page(void *ptr); 67struct page *kmap_atomic_to_page(void *ptr);
68 68
69#ifndef CONFIG_PARAVIRT
70#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
71#endif
72
73#define flush_cache_kmaps() do { } while (0) 69#define flush_cache_kmaps() do { } while (0)
74 70
75extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, 71extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 0675a7c4c20e..2a1bd8f4f23a 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -10,7 +10,6 @@
10 * (display/resolving) 10 * (display/resolving)
11 */ 11 */
12struct arch_hw_breakpoint { 12struct arch_hw_breakpoint {
13 char *name; /* Contains name of the symbol to set bkpt */
14 unsigned long address; 13 unsigned long address;
15 u8 len; 14 u8 len;
16 u8 type; 15 u8 type;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index eeac829a0f44..a929c9ede33d 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -53,13 +53,6 @@ extern void threshold_interrupt(void);
53extern void call_function_interrupt(void); 53extern void call_function_interrupt(void);
54extern void call_function_single_interrupt(void); 54extern void call_function_single_interrupt(void);
55 55
56/* PIC specific functions */
57extern void disable_8259A_irq(unsigned int irq);
58extern void enable_8259A_irq(unsigned int irq);
59extern int i8259A_irq_pending(unsigned int irq);
60extern void make_8259A_irq(unsigned int irq);
61extern void init_8259A(int aeoi);
62
63/* IOAPIC */ 56/* IOAPIC */
64#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) 57#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
65extern unsigned long io_apic_irqs; 58extern unsigned long io_apic_irqs;
diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
new file mode 100644
index 000000000000..e153a2b3889a
--- /dev/null
+++ b/arch/x86/include/asm/hyperv.h
@@ -0,0 +1,186 @@
1#ifndef _ASM_X86_KVM_HYPERV_H
2#define _ASM_X86_KVM_HYPERV_H
3
4#include <linux/types.h>
5
6/*
7 * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
8 * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
9 */
10#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
11#define HYPERV_CPUID_INTERFACE 0x40000001
12#define HYPERV_CPUID_VERSION 0x40000002
13#define HYPERV_CPUID_FEATURES 0x40000003
14#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
15#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
16
17/*
18 * Feature identification. EAX indicates which features are available
19 * to the partition based upon the current partition privileges.
20 */
21
22/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
23#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
24/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
25#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
26/*
27 * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
28 * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
29 */
30#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
31/*
32 * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
33 * HV_X64_MSR_STIMER3_COUNT) available
34 */
35#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
36/*
37 * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
38 * are available
39 */
40#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
41/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
42#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
43/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
44#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
45/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
46#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
47 /*
48 * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
49 * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
50 * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
51 */
52#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
53
54/*
55 * Feature identification: EBX indicates which flags were specified at
56 * partition creation. The format is the same as the partition creation
57 * flag structure defined in section Partition Creation Flags.
58 */
59#define HV_X64_CREATE_PARTITIONS (1 << 0)
60#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
61#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
62#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
63#define HV_X64_POST_MESSAGES (1 << 4)
64#define HV_X64_SIGNAL_EVENTS (1 << 5)
65#define HV_X64_CREATE_PORT (1 << 6)
66#define HV_X64_CONNECT_PORT (1 << 7)
67#define HV_X64_ACCESS_STATS (1 << 8)
68#define HV_X64_DEBUGGING (1 << 11)
69#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
70#define HV_X64_CONFIGURE_PROFILER (1 << 13)
71
72/*
73 * Feature identification. EDX indicates which miscellaneous features
74 * are available to the partition.
75 */
76/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
77#define HV_X64_MWAIT_AVAILABLE (1 << 0)
78/* Guest debugging support is available */
79#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
80/* Performance Monitor support is available*/
81#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
82/* Support for physical CPU dynamic partitioning events is available*/
83#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
84/*
85 * Support for passing hypercall input parameter block via XMM
86 * registers is available
87 */
88#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
89/* Support for a virtual guest idle state is available */
90#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
91
92/*
93 * Implementation recommendations. Indicates which behaviors the hypervisor
94 * recommends the OS implement for optimal performance.
95 */
96 /*
97 * Recommend using hypercall for address space switches rather
98 * than MOV to CR3 instruction
99 */
100#define HV_X64_MWAIT_RECOMMENDED (1 << 0)
101/* Recommend using hypercall for local TLB flushes rather
102 * than INVLPG or MOV to CR3 instructions */
103#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
104/*
105 * Recommend using hypercall for remote TLB flushes rather
106 * than inter-processor interrupts
107 */
108#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
109/*
110 * Recommend using MSRs for accessing APIC registers
111 * EOI, ICR and TPR rather than their memory-mapped counterparts
112 */
113#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
114/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
115#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
116/*
117 * Recommend using relaxed timing for this partition. If used,
118 * the VM should disable any watchdog timeouts that rely on the
119 * timely delivery of external interrupts
120 */
121#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
122
123/* MSR used to identify the guest OS. */
124#define HV_X64_MSR_GUEST_OS_ID 0x40000000
125
126/* MSR used to setup pages used to communicate with the hypervisor. */
127#define HV_X64_MSR_HYPERCALL 0x40000001
128
129/* MSR used to provide vcpu index */
130#define HV_X64_MSR_VP_INDEX 0x40000002
131
132/* Define the virtual APIC registers */
133#define HV_X64_MSR_EOI 0x40000070
134#define HV_X64_MSR_ICR 0x40000071
135#define HV_X64_MSR_TPR 0x40000072
136#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
137
138/* Define synthetic interrupt controller model specific registers. */
139#define HV_X64_MSR_SCONTROL 0x40000080
140#define HV_X64_MSR_SVERSION 0x40000081
141#define HV_X64_MSR_SIEFP 0x40000082
142#define HV_X64_MSR_SIMP 0x40000083
143#define HV_X64_MSR_EOM 0x40000084
144#define HV_X64_MSR_SINT0 0x40000090
145#define HV_X64_MSR_SINT1 0x40000091
146#define HV_X64_MSR_SINT2 0x40000092
147#define HV_X64_MSR_SINT3 0x40000093
148#define HV_X64_MSR_SINT4 0x40000094
149#define HV_X64_MSR_SINT5 0x40000095
150#define HV_X64_MSR_SINT6 0x40000096
151#define HV_X64_MSR_SINT7 0x40000097
152#define HV_X64_MSR_SINT8 0x40000098
153#define HV_X64_MSR_SINT9 0x40000099
154#define HV_X64_MSR_SINT10 0x4000009A
155#define HV_X64_MSR_SINT11 0x4000009B
156#define HV_X64_MSR_SINT12 0x4000009C
157#define HV_X64_MSR_SINT13 0x4000009D
158#define HV_X64_MSR_SINT14 0x4000009E
159#define HV_X64_MSR_SINT15 0x4000009F
160
161
162#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
163#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
164#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
165 (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
166
167/* Declare the various hypercall operations. */
168#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
169
170#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
171#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
172#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
173 (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
174
175#define HV_PROCESSOR_POWER_STATE_C0 0
176#define HV_PROCESSOR_POWER_STATE_C1 1
177#define HV_PROCESSOR_POWER_STATE_C2 2
178#define HV_PROCESSOR_POWER_STATE_C3 3
179
180/* hypercall status code */
181#define HV_STATUS_SUCCESS 0
182#define HV_STATUS_INVALID_HYPERCALL_CODE 2
183#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
184#define HV_STATUS_INVALID_ALIGNMENT 4
185
186#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index ebfb8a9e11f7..da2930924501 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -33,8 +33,16 @@ extern void init_thread_xstate(void);
33extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); 33extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
34 34
35extern user_regset_active_fn fpregs_active, xfpregs_active; 35extern user_regset_active_fn fpregs_active, xfpregs_active;
36extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get; 36extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
37extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set; 37 xstateregs_get;
38extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
39 xstateregs_set;
40
41/*
42 * xstateregs_active == fpregs_active. Please refer to the comment
43 * at the definition of fpregs_active.
44 */
45#define xstateregs_active fpregs_active
38 46
39extern struct _fpx_sw_bytes fx_sw_reserved; 47extern struct _fpx_sw_bytes fx_sw_reserved;
40#ifdef CONFIG_IA32_EMULATION 48#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 58d7091eeb1f..1655147646aa 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -24,12 +24,7 @@ extern unsigned int cached_irq_mask;
24#define SLAVE_ICW4_DEFAULT 0x01 24#define SLAVE_ICW4_DEFAULT 0x01
25#define PIC_ICW4_AEOI 2 25#define PIC_ICW4_AEOI 2
26 26
27extern spinlock_t i8259A_lock; 27extern raw_spinlock_t i8259A_lock;
28
29extern void init_8259A(int auto_eoi);
30extern void enable_8259A_irq(unsigned int irq);
31extern void disable_8259A_irq(unsigned int irq);
32extern unsigned int startup_8259A_irq(unsigned int irq);
33 28
34/* the PIC may need a careful delay on some platforms, hence specific calls */ 29/* the PIC may need a careful delay on some platforms, hence specific calls */
35static inline unsigned char inb_pic(unsigned int port) 30static inline unsigned char inb_pic(unsigned int port)
@@ -57,7 +52,17 @@ static inline void outb_pic(unsigned char value, unsigned int port)
57 52
58extern struct irq_chip i8259A_chip; 53extern struct irq_chip i8259A_chip;
59 54
60extern void mask_8259A(void); 55struct legacy_pic {
61extern void unmask_8259A(void); 56 int nr_legacy_irqs;
57 struct irq_chip *chip;
58 void (*mask_all)(void);
59 void (*restore_mask)(void);
60 void (*init)(int auto_eoi);
61 int (*irq_pending)(unsigned int irq);
62 void (*make_irq)(unsigned int irq);
63};
64
65extern struct legacy_pic *legacy_pic;
66extern struct legacy_pic null_legacy_pic;
62 67
63#endif /* _ASM_X86_I8259_H */ 68#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 73739322b6d0..a1dcfa3ab17d 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -1,8 +1,42 @@
1#ifndef _ASM_X86_IO_H 1#ifndef _ASM_X86_IO_H
2#define _ASM_X86_IO_H 2#define _ASM_X86_IO_H
3 3
4/*
5 * This file contains the definitions for the x86 IO instructions
6 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
7 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
8 * versions of the single-IO instructions (inb_p/inw_p/..).
9 *
10 * This file is not meant to be obfuscating: it's just complicated
11 * to (a) handle it all in a way that makes gcc able to optimize it
12 * as well as possible and (b) trying to avoid writing the same thing
13 * over and over again with slight variations and possibly making a
14 * mistake somewhere.
15 */
16
17/*
18 * Thanks to James van Artsdalen for a better timing-fix than
19 * the two short jumps: using outb's to a nonexistent port seems
20 * to guarantee better timings even on fast machines.
21 *
22 * On the other hand, I'd like to be sure of a non-existent port:
23 * I feel a bit unsafe about using 0x80 (should be safe, though)
24 *
25 * Linus
26 */
27
28 /*
29 * Bit simplified and optimized by Jan Hubicka
30 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
31 *
32 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
33 * isa_read[wl] and isa_write[wl] fixed
34 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
35 */
36
4#define ARCH_HAS_IOREMAP_WC 37#define ARCH_HAS_IOREMAP_WC
5 38
39#include <linux/string.h>
6#include <linux/compiler.h> 40#include <linux/compiler.h>
7#include <asm-generic/int-ll64.h> 41#include <asm-generic/int-ll64.h>
8#include <asm/page.h> 42#include <asm/page.h>
@@ -173,11 +207,126 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
173extern void iounmap(volatile void __iomem *addr); 207extern void iounmap(volatile void __iomem *addr);
174 208
175 209
176#ifdef CONFIG_X86_32 210#ifdef __KERNEL__
177# include "io_32.h" 211
212#include <asm-generic/iomap.h>
213
214#include <linux/vmalloc.h>
215
216/*
217 * Convert a virtual cached pointer to an uncached pointer
218 */
219#define xlate_dev_kmem_ptr(p) p
220
221static inline void
222memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
223{
224 memset((void __force *)addr, val, count);
225}
226
227static inline void
228memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
229{
230 memcpy(dst, (const void __force *)src, count);
231}
232
233static inline void
234memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
235{
236 memcpy((void __force *)dst, src, count);
237}
238
239/*
240 * ISA space is 'always mapped' on a typical x86 system, no need to
241 * explicitly ioremap() it. The fact that the ISA IO space is mapped
242 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
243 * are physical addresses. The following constant pointer can be
244 * used as the IO-area pointer (it can be iounmapped as well, so the
245 * analogy with PCI is quite large):
246 */
247#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
248
249/*
250 * Cache management
251 *
252 * This needed for two cases
253 * 1. Out of order aware processors
254 * 2. Accidentally out of order processors (PPro errata #51)
255 */
256
257static inline void flush_write_buffers(void)
258{
259#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
260 asm volatile("lock; addl $0,0(%%esp)": : :"memory");
261#endif
262}
263
264#endif /* __KERNEL__ */
265
266extern void native_io_delay(void);
267
268extern int io_delay_type;
269extern void io_delay_init(void);
270
271#if defined(CONFIG_PARAVIRT)
272#include <asm/paravirt.h>
178#else 273#else
179# include "io_64.h" 274
275static inline void slow_down_io(void)
276{
277 native_io_delay();
278#ifdef REALLY_SLOW_IO
279 native_io_delay();
280 native_io_delay();
281 native_io_delay();
180#endif 282#endif
283}
284
285#endif
286
287#define BUILDIO(bwl, bw, type) \
288static inline void out##bwl(unsigned type value, int port) \
289{ \
290 asm volatile("out" #bwl " %" #bw "0, %w1" \
291 : : "a"(value), "Nd"(port)); \
292} \
293 \
294static inline unsigned type in##bwl(int port) \
295{ \
296 unsigned type value; \
297 asm volatile("in" #bwl " %w1, %" #bw "0" \
298 : "=a"(value) : "Nd"(port)); \
299 return value; \
300} \
301 \
302static inline void out##bwl##_p(unsigned type value, int port) \
303{ \
304 out##bwl(value, port); \
305 slow_down_io(); \
306} \
307 \
308static inline unsigned type in##bwl##_p(int port) \
309{ \
310 unsigned type value = in##bwl(port); \
311 slow_down_io(); \
312 return value; \
313} \
314 \
315static inline void outs##bwl(int port, const void *addr, unsigned long count) \
316{ \
317 asm volatile("rep; outs" #bwl \
318 : "+S"(addr), "+c"(count) : "d"(port)); \
319} \
320 \
321static inline void ins##bwl(int port, void *addr, unsigned long count) \
322{ \
323 asm volatile("rep; ins" #bwl \
324 : "+D"(addr), "+c"(count) : "d"(port)); \
325}
326
327BUILDIO(b, b, char)
328BUILDIO(w, w, short)
329BUILDIO(l, , int)
181 330
182extern void *xlate_dev_mem_ptr(unsigned long phys); 331extern void *xlate_dev_mem_ptr(unsigned long phys);
183extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); 332extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
deleted file mode 100644
index a299900f5920..000000000000
--- a/arch/x86/include/asm/io_32.h
+++ /dev/null
@@ -1,196 +0,0 @@
1#ifndef _ASM_X86_IO_32_H
2#define _ASM_X86_IO_32_H
3
4#include <linux/string.h>
5#include <linux/compiler.h>
6
7/*
8 * This file contains the definitions for the x86 IO instructions
9 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
10 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
11 * versions of the single-IO instructions (inb_p/inw_p/..).
12 *
13 * This file is not meant to be obfuscating: it's just complicated
14 * to (a) handle it all in a way that makes gcc able to optimize it
15 * as well as possible and (b) trying to avoid writing the same thing
16 * over and over again with slight variations and possibly making a
17 * mistake somewhere.
18 */
19
20/*
21 * Thanks to James van Artsdalen for a better timing-fix than
22 * the two short jumps: using outb's to a nonexistent port seems
23 * to guarantee better timings even on fast machines.
24 *
25 * On the other hand, I'd like to be sure of a non-existent port:
26 * I feel a bit unsafe about using 0x80 (should be safe, though)
27 *
28 * Linus
29 */
30
31 /*
32 * Bit simplified and optimized by Jan Hubicka
33 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
34 *
35 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
36 * isa_read[wl] and isa_write[wl] fixed
37 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
38 */
39
40#define XQUAD_PORTIO_BASE 0xfe400000
41#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
42
43#ifdef __KERNEL__
44
45#include <asm-generic/iomap.h>
46
47#include <linux/vmalloc.h>
48
49/*
50 * Convert a virtual cached pointer to an uncached pointer
51 */
52#define xlate_dev_kmem_ptr(p) p
53
54static inline void
55memset_io(volatile void __iomem *addr, unsigned char val, int count)
56{
57 memset((void __force *)addr, val, count);
58}
59
60static inline void
61memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
62{
63 __memcpy(dst, (const void __force *)src, count);
64}
65
66static inline void
67memcpy_toio(volatile void __iomem *dst, const void *src, int count)
68{
69 __memcpy((void __force *)dst, src, count);
70}
71
72/*
73 * ISA space is 'always mapped' on a typical x86 system, no need to
74 * explicitly ioremap() it. The fact that the ISA IO space is mapped
75 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
76 * are physical addresses. The following constant pointer can be
77 * used as the IO-area pointer (it can be iounmapped as well, so the
78 * analogy with PCI is quite large):
79 */
80#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
81
82/*
83 * Cache management
84 *
85 * This needed for two cases
86 * 1. Out of order aware processors
87 * 2. Accidentally out of order processors (PPro errata #51)
88 */
89
90#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
91
92static inline void flush_write_buffers(void)
93{
94 asm volatile("lock; addl $0,0(%%esp)": : :"memory");
95}
96
97#else
98
99#define flush_write_buffers() do { } while (0)
100
101#endif
102
103#endif /* __KERNEL__ */
104
105extern void native_io_delay(void);
106
107extern int io_delay_type;
108extern void io_delay_init(void);
109
110#if defined(CONFIG_PARAVIRT)
111#include <asm/paravirt.h>
112#else
113
114static inline void slow_down_io(void)
115{
116 native_io_delay();
117#ifdef REALLY_SLOW_IO
118 native_io_delay();
119 native_io_delay();
120 native_io_delay();
121#endif
122}
123
124#endif
125
126#define __BUILDIO(bwl, bw, type) \
127static inline void out##bwl(unsigned type value, int port) \
128{ \
129 out##bwl##_local(value, port); \
130} \
131 \
132static inline unsigned type in##bwl(int port) \
133{ \
134 return in##bwl##_local(port); \
135}
136
137#define BUILDIO(bwl, bw, type) \
138static inline void out##bwl##_local(unsigned type value, int port) \
139{ \
140 asm volatile("out" #bwl " %" #bw "0, %w1" \
141 : : "a"(value), "Nd"(port)); \
142} \
143 \
144static inline unsigned type in##bwl##_local(int port) \
145{ \
146 unsigned type value; \
147 asm volatile("in" #bwl " %w1, %" #bw "0" \
148 : "=a"(value) : "Nd"(port)); \
149 return value; \
150} \
151 \
152static inline void out##bwl##_local_p(unsigned type value, int port) \
153{ \
154 out##bwl##_local(value, port); \
155 slow_down_io(); \
156} \
157 \
158static inline unsigned type in##bwl##_local_p(int port) \
159{ \
160 unsigned type value = in##bwl##_local(port); \
161 slow_down_io(); \
162 return value; \
163} \
164 \
165__BUILDIO(bwl, bw, type) \
166 \
167static inline void out##bwl##_p(unsigned type value, int port) \
168{ \
169 out##bwl(value, port); \
170 slow_down_io(); \
171} \
172 \
173static inline unsigned type in##bwl##_p(int port) \
174{ \
175 unsigned type value = in##bwl(port); \
176 slow_down_io(); \
177 return value; \
178} \
179 \
180static inline void outs##bwl(int port, const void *addr, unsigned long count) \
181{ \
182 asm volatile("rep; outs" #bwl \
183 : "+S"(addr), "+c"(count) : "d"(port)); \
184} \
185 \
186static inline void ins##bwl(int port, void *addr, unsigned long count) \
187{ \
188 asm volatile("rep; ins" #bwl \
189 : "+D"(addr), "+c"(count) : "d"(port)); \
190}
191
192BUILDIO(b, b, char)
193BUILDIO(w, w, short)
194BUILDIO(l, , int)
195
196#endif /* _ASM_X86_IO_32_H */
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
deleted file mode 100644
index 244067893af4..000000000000
--- a/arch/x86/include/asm/io_64.h
+++ /dev/null
@@ -1,181 +0,0 @@
1#ifndef _ASM_X86_IO_64_H
2#define _ASM_X86_IO_64_H
3
4
5/*
6 * This file contains the definitions for the x86 IO instructions
7 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
8 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
9 * versions of the single-IO instructions (inb_p/inw_p/..).
10 *
11 * This file is not meant to be obfuscating: it's just complicated
12 * to (a) handle it all in a way that makes gcc able to optimize it
13 * as well as possible and (b) trying to avoid writing the same thing
14 * over and over again with slight variations and possibly making a
15 * mistake somewhere.
16 */
17
18/*
19 * Thanks to James van Artsdalen for a better timing-fix than
20 * the two short jumps: using outb's to a nonexistent port seems
21 * to guarantee better timings even on fast machines.
22 *
23 * On the other hand, I'd like to be sure of a non-existent port:
24 * I feel a bit unsafe about using 0x80 (should be safe, though)
25 *
26 * Linus
27 */
28
29 /*
30 * Bit simplified and optimized by Jan Hubicka
31 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
32 *
33 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
34 * isa_read[wl] and isa_write[wl] fixed
35 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
36 */
37
38extern void native_io_delay(void);
39
40extern int io_delay_type;
41extern void io_delay_init(void);
42
43#if defined(CONFIG_PARAVIRT)
44#include <asm/paravirt.h>
45#else
46
47static inline void slow_down_io(void)
48{
49 native_io_delay();
50#ifdef REALLY_SLOW_IO
51 native_io_delay();
52 native_io_delay();
53 native_io_delay();
54#endif
55}
56#endif
57
58/*
59 * Talk about misusing macros..
60 */
61#define __OUT1(s, x) \
62static inline void out##s(unsigned x value, unsigned short port) {
63
64#define __OUT2(s, s1, s2) \
65asm volatile ("out" #s " %" s1 "0,%" s2 "1"
66
67#ifndef REALLY_SLOW_IO
68#define REALLY_SLOW_IO
69#define UNSET_REALLY_SLOW_IO
70#endif
71
72#define __OUT(s, s1, x) \
73 __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
74 } \
75 __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
76 slow_down_io(); \
77}
78
79#define __IN1(s) \
80static inline RETURN_TYPE in##s(unsigned short port) \
81{ \
82 RETURN_TYPE _v;
83
84#define __IN2(s, s1, s2) \
85 asm volatile ("in" #s " %" s2 "1,%" s1 "0"
86
87#define __IN(s, s1, i...) \
88 __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
89 return _v; \
90 } \
91 __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
92 slow_down_io(); \
93 return _v; }
94
95#ifdef UNSET_REALLY_SLOW_IO
96#undef REALLY_SLOW_IO
97#endif
98
99#define __INS(s) \
100static inline void ins##s(unsigned short port, void *addr, \
101 unsigned long count) \
102{ \
103 asm volatile ("rep ; ins" #s \
104 : "=D" (addr), "=c" (count) \
105 : "d" (port), "0" (addr), "1" (count)); \
106}
107
108#define __OUTS(s) \
109static inline void outs##s(unsigned short port, const void *addr, \
110 unsigned long count) \
111{ \
112 asm volatile ("rep ; outs" #s \
113 : "=S" (addr), "=c" (count) \
114 : "d" (port), "0" (addr), "1" (count)); \
115}
116
117#define RETURN_TYPE unsigned char
118__IN(b, "")
119#undef RETURN_TYPE
120#define RETURN_TYPE unsigned short
121__IN(w, "")
122#undef RETURN_TYPE
123#define RETURN_TYPE unsigned int
124__IN(l, "")
125#undef RETURN_TYPE
126
127__OUT(b, "b", char)
128__OUT(w, "w", short)
129__OUT(l, , int)
130
131__INS(b)
132__INS(w)
133__INS(l)
134
135__OUTS(b)
136__OUTS(w)
137__OUTS(l)
138
139#if defined(__KERNEL__) && defined(__x86_64__)
140
141#include <linux/vmalloc.h>
142
143#include <asm-generic/iomap.h>
144
145void __memcpy_fromio(void *, unsigned long, unsigned);
146void __memcpy_toio(unsigned long, const void *, unsigned);
147
148static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
149 unsigned len)
150{
151 __memcpy_fromio(to, (unsigned long)from, len);
152}
153
154static inline void memcpy_toio(volatile void __iomem *to, const void *from,
155 unsigned len)
156{
157 __memcpy_toio((unsigned long)to, from, len);
158}
159
160void memset_io(volatile void __iomem *a, int b, size_t c);
161
162/*
163 * ISA space is 'always mapped' on a typical x86 system, no need to
164 * explicitly ioremap() it. The fact that the ISA IO space is mapped
165 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
166 * are physical addresses. The following constant pointer can be
167 * used as the IO-area pointer (it can be iounmapped as well, so the
168 * analogy with PCI is quite large):
169 */
170#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
171
172#define flush_write_buffers()
173
174/*
175 * Convert a virtual cached pointer to an uncached pointer
176 */
177#define xlate_dev_kmem_ptr(p) p
178
179#endif /* __KERNEL__ */
180
181#endif /* _ASM_X86_IO_64_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 5f61f6e0ffdd..35832a03a515 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -143,8 +143,6 @@ extern int noioapicreroute;
143/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ 143/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
144extern int timer_through_8259; 144extern int timer_through_8259;
145 145
146extern void io_apic_disable_legacy(void);
147
148/* 146/*
149 * If we use the IO-APIC for IRQ routing, disable automatic 147 * If we use the IO-APIC for IRQ routing, disable automatic
150 * assignment of PCI IRQ's. 148 * assignment of PCI IRQ's.
@@ -189,6 +187,7 @@ extern struct mp_ioapic_gsi mp_gsi_routing[];
189int mp_find_ioapic(int gsi); 187int mp_find_ioapic(int gsi);
190int mp_find_ioapic_pin(int ioapic, int gsi); 188int mp_find_ioapic_pin(int ioapic, int gsi);
191void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); 189void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
190extern void __init pre_init_apic_IRQ0(void);
192 191
193#else /* !CONFIG_X86_IO_APIC */ 192#else /* !CONFIG_X86_IO_APIC */
194 193
@@ -198,7 +197,11 @@ static const int timer_through_8259 = 0;
198static inline void ioapic_init_mappings(void) { } 197static inline void ioapic_init_mappings(void) { }
199static inline void ioapic_insert_resources(void) { } 198static inline void ioapic_insert_resources(void) { }
200static inline void probe_nr_irqs_gsi(void) { } 199static inline void probe_nr_irqs_gsi(void) { }
200static inline int mp_find_ioapic(int gsi) { return 0; }
201 201
202struct io_apic_irq_attr;
203static inline int io_apic_set_pci_routing(struct device *dev, int irq,
204 struct io_apic_irq_attr *irq_attr) { return 0; }
202#endif 205#endif
203 206
204#endif /* _ASM_X86_IO_APIC_H */ 207#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4611f085cd43..8767d99c4f64 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -28,28 +28,33 @@
28#define MCE_VECTOR 0x12 28#define MCE_VECTOR 0x12
29 29
30/* 30/*
31 * IDT vectors usable for external interrupt sources start 31 * IDT vectors usable for external interrupt sources start at 0x20.
32 * at 0x20: 32 * (0x80 is the syscall vector, 0x30-0x3f are for ISA)
33 */ 33 */
34#define FIRST_EXTERNAL_VECTOR 0x20 34#define FIRST_EXTERNAL_VECTOR 0x20
35 35/*
36#ifdef CONFIG_X86_32 36 * We start allocating at 0x21 to spread out vectors evenly between
37# define SYSCALL_VECTOR 0x80 37 * priority levels. (0x80 is the syscall vector)
38# define IA32_SYSCALL_VECTOR 0x80 38 */
39#else 39#define VECTOR_OFFSET_START 1
40# define IA32_SYSCALL_VECTOR 0x80
41#endif
42 40
43/* 41/*
44 * Reserve the lowest usable priority level 0x20 - 0x2f for triggering 42 * Reserve the lowest usable vector (and hence lowest priority) 0x20 for
45 * cleanup after irq migration. 43 * triggering cleanup after irq migration. 0x21-0x2f will still be used
44 * for device interrupts.
46 */ 45 */
47#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR 46#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
48 47
48#define IA32_SYSCALL_VECTOR 0x80
49#ifdef CONFIG_X86_32
50# define SYSCALL_VECTOR 0x80
51#endif
52
49/* 53/*
50 * Vectors 0x30-0x3f are used for ISA interrupts. 54 * Vectors 0x30-0x3f are used for ISA interrupts.
55 * round up to the next 16-vector boundary
51 */ 56 */
52#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) 57#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15)
53 58
54#define IRQ1_VECTOR (IRQ0_VECTOR + 1) 59#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
55#define IRQ2_VECTOR (IRQ0_VECTOR + 2) 60#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
@@ -120,13 +125,6 @@
120 */ 125 */
121#define MCE_SELF_VECTOR 0xeb 126#define MCE_SELF_VECTOR 0xeb
122 127
123/*
124 * First APIC vector available to drivers: (vectors 0x30-0xee) we
125 * start at 0x31(0x41) to spread out vectors evenly between priority
126 * levels. (0x80 is the syscall vector)
127 */
128#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
129
130#define NR_VECTORS 256 128#define NR_VECTORS 256
131 129
132#define FPU_IRQ 13 130#define FPU_IRQ 13
@@ -154,21 +152,21 @@ static inline int invalid_vm86_irq(int irq)
154 152
155#define NR_IRQS_LEGACY 16 153#define NR_IRQS_LEGACY 16
156 154
157#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
158#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) 155#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
159 156
160#ifdef CONFIG_X86_IO_APIC 157#ifdef CONFIG_X86_IO_APIC
161# ifdef CONFIG_SPARSE_IRQ 158# ifdef CONFIG_SPARSE_IRQ
159# define CPU_VECTOR_LIMIT (64 * NR_CPUS)
162# define NR_IRQS \ 160# define NR_IRQS \
163 (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ 161 (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
164 (NR_VECTORS + CPU_VECTOR_LIMIT) : \ 162 (NR_VECTORS + CPU_VECTOR_LIMIT) : \
165 (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) 163 (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
166# else 164# else
167# if NR_CPUS < MAX_IO_APICS 165# define CPU_VECTOR_LIMIT (32 * NR_CPUS)
168# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) 166# define NR_IRQS \
169# else 167 (CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ? \
170# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) 168 (NR_VECTORS + CPU_VECTOR_LIMIT) : \
171# endif 169 (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
172# endif 170# endif
173#else /* !CONFIG_X86_IO_APIC: */ 171#else /* !CONFIG_X86_IO_APIC: */
174# define NR_IRQS NR_IRQS_LEGACY 172# define NR_IRQS NR_IRQS_LEGACY
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4fe681de1e76..4ffa345a8ccb 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -32,7 +32,10 @@ struct kprobe;
32 32
33typedef u8 kprobe_opcode_t; 33typedef u8 kprobe_opcode_t;
34#define BREAKPOINT_INSTRUCTION 0xcc 34#define BREAKPOINT_INSTRUCTION 0xcc
35#define RELATIVEJUMP_INSTRUCTION 0xe9 35#define RELATIVEJUMP_OPCODE 0xe9
36#define RELATIVEJUMP_SIZE 5
37#define RELATIVECALL_OPCODE 0xe8
38#define RELATIVE_ADDR_SIZE 4
36#define MAX_INSN_SIZE 16 39#define MAX_INSN_SIZE 16
37#define MAX_STACK_SIZE 64 40#define MAX_STACK_SIZE 64
38#define MIN_STACK_SIZE(ADDR) \ 41#define MIN_STACK_SIZE(ADDR) \
@@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t;
44 47
45#define flush_insn_slot(p) do { } while (0) 48#define flush_insn_slot(p) do { } while (0)
46 49
50/* optinsn template addresses */
51extern kprobe_opcode_t optprobe_template_entry;
52extern kprobe_opcode_t optprobe_template_val;
53extern kprobe_opcode_t optprobe_template_call;
54extern kprobe_opcode_t optprobe_template_end;
55#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
56#define MAX_OPTINSN_SIZE \
57 (((unsigned long)&optprobe_template_end - \
58 (unsigned long)&optprobe_template_entry) + \
59 MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
60
47extern const int kretprobe_blacklist_size; 61extern const int kretprobe_blacklist_size;
48 62
49void arch_remove_kprobe(struct kprobe *p); 63void arch_remove_kprobe(struct kprobe *p);
@@ -64,6 +78,21 @@ struct arch_specific_insn {
64 int boostable; 78 int boostable;
65}; 79};
66 80
81struct arch_optimized_insn {
82 /* copy of the original instructions */
83 kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
84 /* detour code buffer */
85 kprobe_opcode_t *insn;
86 /* the size of instructions copied to detour code buffer */
87 size_t size;
88};
89
90/* Return true (!0) if optinsn is prepared for optimization. */
91static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
92{
93 return optinsn->size;
94}
95
67struct prev_kprobe { 96struct prev_kprobe {
68 struct kprobe *kp; 97 struct kprobe *kp;
69 unsigned long status; 98 unsigned long status;
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7c18e1230f54..7a6f54fa13ba 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -54,13 +54,23 @@ struct x86_emulate_ctxt;
54struct x86_emulate_ops { 54struct x86_emulate_ops {
55 /* 55 /*
56 * read_std: Read bytes of standard (non-emulated/special) memory. 56 * read_std: Read bytes of standard (non-emulated/special) memory.
57 * Used for instruction fetch, stack operations, and others. 57 * Used for descriptor reading.
58 * @addr: [IN ] Linear address from which to read. 58 * @addr: [IN ] Linear address from which to read.
59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
60 * @bytes: [IN ] Number of bytes to read from memory. 60 * @bytes: [IN ] Number of bytes to read from memory.
61 */ 61 */
62 int (*read_std)(unsigned long addr, void *val, 62 int (*read_std)(unsigned long addr, void *val,
63 unsigned int bytes, struct kvm_vcpu *vcpu); 63 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
64
65 /*
66 * fetch: Read bytes of standard (non-emulated/special) memory.
67 * Used for instruction fetch.
68 * @addr: [IN ] Linear address from which to read.
69 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
70 * @bytes: [IN ] Number of bytes to read from memory.
71 */
72 int (*fetch)(unsigned long addr, void *val,
73 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
64 74
65 /* 75 /*
66 * read_emulated: Read bytes from emulated/special memory area. 76 * read_emulated: Read bytes from emulated/special memory area.
@@ -74,7 +84,7 @@ struct x86_emulate_ops {
74 struct kvm_vcpu *vcpu); 84 struct kvm_vcpu *vcpu);
75 85
76 /* 86 /*
77 * write_emulated: Read bytes from emulated/special memory area. 87 * write_emulated: Write bytes to emulated/special memory area.
78 * @addr: [IN ] Linear address to which to write. 88 * @addr: [IN ] Linear address to which to write.
79 * @val: [IN ] Value to write to memory (low-order bytes used as 89 * @val: [IN ] Value to write to memory (low-order bytes used as
80 * required). 90 * required).
@@ -168,6 +178,7 @@ struct x86_emulate_ctxt {
168 178
169/* Execution mode, passed to the emulator. */ 179/* Execution mode, passed to the emulator. */
170#define X86EMUL_MODE_REAL 0 /* Real mode. */ 180#define X86EMUL_MODE_REAL 0 /* Real mode. */
181#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
171#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ 182#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
172#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ 183#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
173#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ 184#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4f865e8b8540..06d9e79ca37d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,7 +25,7 @@
25#include <asm/mtrr.h> 25#include <asm/mtrr.h>
26#include <asm/msr-index.h> 26#include <asm/msr-index.h>
27 27
28#define KVM_MAX_VCPUS 16 28#define KVM_MAX_VCPUS 64
29#define KVM_MEMORY_SLOTS 32 29#define KVM_MEMORY_SLOTS 32
30/* memory slots that does not exposed to userspace */ 30/* memory slots that does not exposed to userspace */
31#define KVM_PRIVATE_MEM_SLOTS 4 31#define KVM_PRIVATE_MEM_SLOTS 4
@@ -38,19 +38,6 @@
38#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 38#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
39 0xFFFFFF0000000000ULL) 39 0xFFFFFF0000000000ULL)
40 40
41#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
42 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
43#define KVM_GUEST_CR0_MASK \
44 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
45#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
46 (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
47#define KVM_VM_CR0_ALWAYS_ON \
48 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
49#define KVM_GUEST_CR4_MASK \
50 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
51#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
52#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
53
54#define INVALID_PAGE (~(hpa_t)0) 41#define INVALID_PAGE (~(hpa_t)0)
55#define UNMAPPED_GVA (~(gpa_t)0) 42#define UNMAPPED_GVA (~(gpa_t)0)
56 43
@@ -256,7 +243,8 @@ struct kvm_mmu {
256 void (*new_cr3)(struct kvm_vcpu *vcpu); 243 void (*new_cr3)(struct kvm_vcpu *vcpu);
257 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 244 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
258 void (*free)(struct kvm_vcpu *vcpu); 245 void (*free)(struct kvm_vcpu *vcpu);
259 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); 246 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
247 u32 *error);
260 void (*prefetch_page)(struct kvm_vcpu *vcpu, 248 void (*prefetch_page)(struct kvm_vcpu *vcpu,
261 struct kvm_mmu_page *page); 249 struct kvm_mmu_page *page);
262 int (*sync_page)(struct kvm_vcpu *vcpu, 250 int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -282,13 +270,15 @@ struct kvm_vcpu_arch {
282 u32 regs_dirty; 270 u32 regs_dirty;
283 271
284 unsigned long cr0; 272 unsigned long cr0;
273 unsigned long cr0_guest_owned_bits;
285 unsigned long cr2; 274 unsigned long cr2;
286 unsigned long cr3; 275 unsigned long cr3;
287 unsigned long cr4; 276 unsigned long cr4;
277 unsigned long cr4_guest_owned_bits;
288 unsigned long cr8; 278 unsigned long cr8;
289 u32 hflags; 279 u32 hflags;
290 u64 pdptrs[4]; /* pae */ 280 u64 pdptrs[4]; /* pae */
291 u64 shadow_efer; 281 u64 efer;
292 u64 apic_base; 282 u64 apic_base;
293 struct kvm_lapic *apic; /* kernel irqchip context */ 283 struct kvm_lapic *apic; /* kernel irqchip context */
294 int32_t apic_arb_prio; 284 int32_t apic_arb_prio;
@@ -374,17 +364,27 @@ struct kvm_vcpu_arch {
374 /* used for guest single stepping over the given code position */ 364 /* used for guest single stepping over the given code position */
375 u16 singlestep_cs; 365 u16 singlestep_cs;
376 unsigned long singlestep_rip; 366 unsigned long singlestep_rip;
367 /* fields used by HYPER-V emulation */
368 u64 hv_vapic;
377}; 369};
378 370
379struct kvm_mem_alias { 371struct kvm_mem_alias {
380 gfn_t base_gfn; 372 gfn_t base_gfn;
381 unsigned long npages; 373 unsigned long npages;
382 gfn_t target_gfn; 374 gfn_t target_gfn;
375#define KVM_ALIAS_INVALID 1UL
376 unsigned long flags;
383}; 377};
384 378
385struct kvm_arch{ 379#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
386 int naliases; 380
381struct kvm_mem_aliases {
387 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; 382 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
383 int naliases;
384};
385
386struct kvm_arch {
387 struct kvm_mem_aliases *aliases;
388 388
389 unsigned int n_free_mmu_pages; 389 unsigned int n_free_mmu_pages;
390 unsigned int n_requested_mmu_pages; 390 unsigned int n_requested_mmu_pages;
@@ -416,6 +416,10 @@ struct kvm_arch{
416 s64 kvmclock_offset; 416 s64 kvmclock_offset;
417 417
418 struct kvm_xen_hvm_config xen_hvm_config; 418 struct kvm_xen_hvm_config xen_hvm_config;
419
420 /* fields used by HYPER-V emulation */
421 u64 hv_guest_os_id;
422 u64 hv_hypercall;
419}; 423};
420 424
421struct kvm_vm_stat { 425struct kvm_vm_stat {
@@ -471,6 +475,7 @@ struct kvm_x86_ops {
471 int (*hardware_setup)(void); /* __init */ 475 int (*hardware_setup)(void); /* __init */
472 void (*hardware_unsetup)(void); /* __exit */ 476 void (*hardware_unsetup)(void); /* __exit */
473 bool (*cpu_has_accelerated_tpr)(void); 477 bool (*cpu_has_accelerated_tpr)(void);
478 void (*cpuid_update)(struct kvm_vcpu *vcpu);
474 479
475 /* Create, but do not attach this VCPU */ 480 /* Create, but do not attach this VCPU */
476 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); 481 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
@@ -492,6 +497,7 @@ struct kvm_x86_ops {
492 void (*set_segment)(struct kvm_vcpu *vcpu, 497 void (*set_segment)(struct kvm_vcpu *vcpu,
493 struct kvm_segment *var, int seg); 498 struct kvm_segment *var, int seg);
494 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); 499 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
500 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
495 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); 501 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
496 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 502 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
497 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 503 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -501,12 +507,13 @@ struct kvm_x86_ops {
501 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 507 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
502 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 508 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
503 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 509 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
504 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); 510 int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
505 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, 511 int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
506 int *exception);
507 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); 512 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
508 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 513 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
509 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 514 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
515 void (*fpu_activate)(struct kvm_vcpu *vcpu);
516 void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
510 517
511 void (*tlb_flush)(struct kvm_vcpu *vcpu); 518 void (*tlb_flush)(struct kvm_vcpu *vcpu);
512 519
@@ -531,7 +538,8 @@ struct kvm_x86_ops {
531 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 538 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
532 int (*get_tdp_level)(void); 539 int (*get_tdp_level)(void);
533 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 540 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
534 bool (*gb_page_enable)(void); 541 int (*get_lpage_level)(void);
542 bool (*rdtscp_supported)(void);
535 543
536 const struct trace_print_flags *exit_reasons_str; 544 const struct trace_print_flags *exit_reasons_str;
537}; 545};
@@ -606,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
606 unsigned long value); 614 unsigned long value);
607 615
608void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 616void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
609int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 617int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
610 int type_bits, int seg);
611 618
612int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); 619int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
613 620
@@ -653,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
653int kvm_mmu_load(struct kvm_vcpu *vcpu); 660int kvm_mmu_load(struct kvm_vcpu *vcpu);
654void kvm_mmu_unload(struct kvm_vcpu *vcpu); 661void kvm_mmu_unload(struct kvm_vcpu *vcpu);
655void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 662void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
663gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
664gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
665gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
666gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
656 667
657int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 668int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
658 669
@@ -666,6 +677,7 @@ void kvm_disable_tdp(void);
666 677
667int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); 678int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
668int complete_pio(struct kvm_vcpu *vcpu); 679int complete_pio(struct kvm_vcpu *vcpu);
680bool kvm_check_iopl(struct kvm_vcpu *vcpu);
669 681
670struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); 682struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
671 683
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c584076a47f4..ffae1420e7d7 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_KVM_PARA_H 2#define _ASM_X86_KVM_PARA_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/hyperv.h>
5 6
6/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It 7/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
7 * should be used to determine that a VM is running under KVM. 8 * should be used to determine that a VM is running under KVM.
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 47b9b6f19057..2e9972468a5d 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -195,41 +195,4 @@ static inline long local_sub_return(long i, local_t *l)
195#define __local_add(i, l) local_add((i), (l)) 195#define __local_add(i, l) local_add((i), (l))
196#define __local_sub(i, l) local_sub((i), (l)) 196#define __local_sub(i, l) local_sub((i), (l))
197 197
198/* Use these for per-cpu local_t variables: on some archs they are
199 * much more efficient than these naive implementations. Note they take
200 * a variable, not an address.
201 *
202 * X86_64: This could be done better if we moved the per cpu data directly
203 * after GS.
204 */
205
206/* Need to disable preemption for the cpu local counters otherwise we could
207 still access a variable of a previous CPU in a non atomic way. */
208#define cpu_local_wrap_v(l) \
209({ \
210 local_t res__; \
211 preempt_disable(); \
212 res__ = (l); \
213 preempt_enable(); \
214 res__; \
215})
216#define cpu_local_wrap(l) \
217({ \
218 preempt_disable(); \
219 (l); \
220 preempt_enable(); \
221}) \
222
223#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
224#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
225#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
226#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
227#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
228#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
229
230#define __cpu_local_inc(l) cpu_local_inc((l))
231#define __cpu_local_dec(l) cpu_local_dec((l))
232#define __cpu_local_add(i, l) cpu_local_add((i), (l))
233#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
234
235#endif /* _ASM_X86_LOCAL_H */ 198#endif /* _ASM_X86_LOCAL_H */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index a29f48c2a322..288b96f815a6 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ 40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
41 NODE_DATA(nid)->node_spanned_pages) 41 NODE_DATA(nid)->node_spanned_pages)
42
43#ifdef CONFIG_NUMA_EMU
44#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024)
45#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
46#endif
47
48#endif 42#endif
49#endif /* _ASM_X86_MMZONE_64_H */ 43#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
new file mode 100644
index 000000000000..451d30e7f62d
--- /dev/null
+++ b/arch/x86/include/asm/mrst.h
@@ -0,0 +1,19 @@
1/*
2 * mrst.h: Intel Moorestown platform specific setup code
3 *
4 * (C) Copyright 2009 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11#ifndef _ASM_X86_MRST_H
12#define _ASM_X86_MRST_H
13extern int pci_mrst_init(void);
14int __init sfi_parse_mrtc(struct sfi_table_header *table);
15
16#define SFI_MTMR_MAX_NUM 8
17#define SFI_MRTC_MAX 8
18
19#endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 139d4c1a33a7..93da9c3f3341 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -19,7 +19,6 @@ extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
19extern int check_nmi_watchdog(void); 19extern int check_nmi_watchdog(void);
20extern int nmi_watchdog_enabled; 20extern int nmi_watchdog_enabled;
21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
22extern int avail_to_resrv_perfctr_nmi(unsigned int);
23extern int reserve_perfctr_nmi(unsigned int); 22extern int reserve_perfctr_nmi(unsigned int);
24extern void release_perfctr_nmi(unsigned int); 23extern void release_perfctr_nmi(unsigned int);
25extern int reserve_evntsel_nmi(unsigned int); 24extern int reserve_evntsel_nmi(unsigned int);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index c4ae822e415f..823e070e7c26 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node);
36extern void __cpuinit numa_clear_node(int cpu); 36extern void __cpuinit numa_clear_node(int cpu);
37extern void __cpuinit numa_add_cpu(int cpu); 37extern void __cpuinit numa_add_cpu(int cpu);
38extern void __cpuinit numa_remove_cpu(int cpu); 38extern void __cpuinit numa_remove_cpu(int cpu);
39
40#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)64 << 20)
42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
43#endif /* CONFIG_NUMA_EMU */
39#else 44#else
40static inline void init_cpu_to_node(void) { } 45static inline void init_cpu_to_node(void) { }
41static inline void numa_set_node(int cpu, int node) { } 46static inline void numa_set_node(int cpu, int node) { }
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 9f0a5f5d29ec..37c516545ec8 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -30,9 +30,14 @@
30 30
31extern int found_numaq; 31extern int found_numaq;
32extern int get_memcfg_numaq(void); 32extern int get_memcfg_numaq(void);
33extern int pci_numaq_init(void);
33 34
34extern void *xquad_portio; 35extern void *xquad_portio;
35 36
37#define XQUAD_PORTIO_BASE 0xfe400000
38#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
39#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
40
36/* 41/*
37 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the 42 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
38 */ 43 */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 3a57385d9fa7..101229b0d8ed 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -13,7 +13,6 @@ struct olpc_platform_t {
13 13
14#define OLPC_F_PRESENT 0x01 14#define OLPC_F_PRESENT 0x01
15#define OLPC_F_DCON 0x02 15#define OLPC_F_DCON 0x02
16#define OLPC_F_VSA 0x04
17 16
18#ifdef CONFIG_OLPC 17#ifdef CONFIG_OLPC
19 18
@@ -51,18 +50,6 @@ static inline int olpc_has_dcon(void)
51} 50}
52 51
53/* 52/*
54 * The VSA is software from AMD that typical Geode bioses will include.
55 * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does
56 * not include the VSA; instead, PCI is emulated by the kernel.
57 *
58 * The VSA is described further in arch/x86/pci/olpc.c.
59 */
60static inline int olpc_has_vsa(void)
61{
62 return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0;
63}
64
65/*
66 * The "Mass Production" version of OLPC's XO is identified as being model 53 * The "Mass Production" version of OLPC's XO is identified as being model
67 * C2. During the prototype phase, the following models (in chronological 54 * C2. During the prototype phase, the following models (in chronological
68 * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models 55 * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models
@@ -87,13 +74,10 @@ static inline int olpc_has_dcon(void)
87 return 0; 74 return 0;
88} 75}
89 76
90static inline int olpc_has_vsa(void)
91{
92 return 0;
93}
94
95#endif 77#endif
96 78
79extern int pci_olpc_init(void);
80
97/* EC related functions */ 81/* EC related functions */
98 82
99extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, 83extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 642fe34b36a2..a667f24c7254 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,7 +40,6 @@
40 40
41#ifndef __ASSEMBLY__ 41#ifndef __ASSEMBLY__
42 42
43extern int page_is_ram(unsigned long pagenr);
44extern int devmem_is_allowed(unsigned long pagenr); 43extern int devmem_is_allowed(unsigned long pagenr);
45 44
46extern unsigned long max_low_pfn_mapped; 45extern unsigned long max_low_pfn_mapped;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index dd59a85a918f..5653f43d90e5 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,15 +435,6 @@ static inline void paravirt_release_pud(unsigned long pfn)
435 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); 435 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
436} 436}
437 437
438#ifdef CONFIG_HIGHPTE
439static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
440{
441 unsigned long ret;
442 ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
443 return (void *)ret;
444}
445#endif
446
447static inline void pte_update(struct mm_struct *mm, unsigned long addr, 438static inline void pte_update(struct mm_struct *mm, unsigned long addr,
448 pte_t *ptep) 439 pte_t *ptep)
449{ 440{
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b1e70d51e40c..db9ef5532341 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -304,10 +304,6 @@ struct pv_mmu_ops {
304#endif /* PAGETABLE_LEVELS == 4 */ 304#endif /* PAGETABLE_LEVELS == 4 */
305#endif /* PAGETABLE_LEVELS >= 3 */ 305#endif /* PAGETABLE_LEVELS >= 3 */
306 306
307#ifdef CONFIG_HIGHPTE
308 void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
309#endif
310
311 struct pv_lazy_ops lazy_mode; 307 struct pv_lazy_ops lazy_mode;
312 308
313 /* dom0 ops */ 309 /* dom0 ops */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ada8c201d513..404a880ea325 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -45,8 +45,15 @@ static inline int pci_proc_domain(struct pci_bus *bus)
45 45
46#ifdef CONFIG_PCI 46#ifdef CONFIG_PCI
47extern unsigned int pcibios_assign_all_busses(void); 47extern unsigned int pcibios_assign_all_busses(void);
48extern int pci_legacy_init(void);
49# ifdef CONFIG_ACPI
50# define x86_default_pci_init pci_acpi_init
51# else
52# define x86_default_pci_init pci_legacy_init
53# endif
48#else 54#else
49#define pcibios_assign_all_busses() 0 55# define pcibios_assign_all_busses() 0
56# define x86_default_pci_init NULL
50#endif 57#endif
51 58
52extern unsigned long pci_mem_start; 59extern unsigned long pci_mem_start;
@@ -90,40 +97,14 @@ extern void pci_iommu_alloc(void);
90 97
91#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) 98#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
92 99
93#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG)
94
95#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
96 dma_addr_t ADDR_NAME;
97#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
98 __u32 LEN_NAME;
99#define pci_unmap_addr(PTR, ADDR_NAME) \
100 ((PTR)->ADDR_NAME)
101#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
102 (((PTR)->ADDR_NAME) = (VAL))
103#define pci_unmap_len(PTR, LEN_NAME) \
104 ((PTR)->LEN_NAME)
105#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
106 (((PTR)->LEN_NAME) = (VAL))
107
108#else
109
110#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
111#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
112#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
113#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
114 do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
115#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
116#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
117 do { break; } while (pci_unmap_len(PTR, LEN_NAME))
118
119#endif
120
121#endif /* __KERNEL__ */ 100#endif /* __KERNEL__ */
122 101
123#ifdef CONFIG_X86_64 102#ifdef CONFIG_X86_64
124#include "pci_64.h" 103#include "pci_64.h"
125#endif 104#endif
126 105
106void dma32_reserve_bootmem(void);
107
127/* implement the pci_ DMA API in terms of the generic device dma_ one */ 108/* implement the pci_ DMA API in terms of the generic device dma_ one */
128#include <asm-generic/pci-dma-compat.h> 109#include <asm-generic/pci-dma-compat.h>
129 110
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index ae5e40f67daf..fe15cfb21b9b 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -22,8 +22,6 @@ extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
22extern int (*pci_config_write)(int seg, int bus, int dev, int fn, 22extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
23 int reg, int len, u32 value); 23 int reg, int len, u32 value);
24 24
25extern void dma32_reserve_bootmem(void);
26
27#endif /* __KERNEL__ */ 25#endif /* __KERNEL__ */
28 26
29#endif /* _ASM_X86_PCI_64_H */ 27#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b4bf9a942ed0..1a0422348d6d 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -29,6 +29,7 @@
29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000
30#define PCI_HAS_IO_ECS 0x40000 30#define PCI_HAS_IO_ECS 0x40000
31#define PCI_NOASSIGN_ROMS 0x80000 31#define PCI_NOASSIGN_ROMS 0x80000
32#define PCI_ROOT_NO_CRS 0x100000
32 33
33extern unsigned int pci_probe; 34extern unsigned int pci_probe;
34extern unsigned long pirq_table_addr; 35extern unsigned long pirq_table_addr;
@@ -82,7 +83,6 @@ struct irq_routing_table {
82 83
83extern unsigned int pcibios_irq_mask; 84extern unsigned int pcibios_irq_mask;
84 85
85extern int pcibios_scanned;
86extern spinlock_t pci_config_lock; 86extern spinlock_t pci_config_lock;
87 87
88extern int (*pcibios_enable_irq)(struct pci_dev *dev); 88extern int (*pcibios_enable_irq)(struct pci_dev *dev);
@@ -105,16 +105,15 @@ extern bool port_cf9_safe;
105extern int pci_direct_probe(void); 105extern int pci_direct_probe(void);
106extern void pci_direct_init(int type); 106extern void pci_direct_init(int type);
107extern void pci_pcbios_init(void); 107extern void pci_pcbios_init(void);
108extern int pci_olpc_init(void);
109extern void __init dmi_check_pciprobe(void); 108extern void __init dmi_check_pciprobe(void);
110extern void __init dmi_check_skip_isa_align(void); 109extern void __init dmi_check_skip_isa_align(void);
111 110
112/* some common used subsys_initcalls */ 111/* some common used subsys_initcalls */
113extern int __init pci_acpi_init(void); 112extern int __init pci_acpi_init(void);
114extern int __init pcibios_irq_init(void); 113extern void __init pcibios_irq_init(void);
115extern int __init pci_visws_init(void);
116extern int __init pci_numaq_init(void);
117extern int __init pcibios_init(void); 114extern int __init pcibios_init(void);
115extern int pci_legacy_init(void);
116extern void pcibios_fixup_irqs(void);
118 117
119/* pci-mmconfig.c */ 118/* pci-mmconfig.c */
120 119
@@ -182,3 +181,17 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val)
182{ 181{
183 asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory"); 182 asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");
184} 183}
184
185#ifdef CONFIG_PCI
186# ifdef CONFIG_ACPI
187# define x86_default_pci_init pci_acpi_init
188# else
189# define x86_default_pci_init pci_legacy_init
190# endif
191# define x86_default_pci_init_irq pcibios_irq_init
192# define x86_default_pci_fixup_irqs pcibios_fixup_irqs
193#else
194# define x86_default_pci_init NULL
195# define x86_default_pci_init_irq NULL
196# define x86_default_pci_fixup_irqs NULL
197#endif
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 0c44196b78ac..66a272dfd8b8 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -25,19 +25,18 @@
25 */ 25 */
26#ifdef CONFIG_SMP 26#ifdef CONFIG_SMP
27#define PER_CPU(var, reg) \ 27#define PER_CPU(var, reg) \
28 __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \ 28 __percpu_mov_op %__percpu_seg:this_cpu_off, reg; \
29 lea per_cpu__##var(reg), reg 29 lea var(reg), reg
30#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var 30#define PER_CPU_VAR(var) %__percpu_seg:var
31#else /* ! SMP */ 31#else /* ! SMP */
32#define PER_CPU(var, reg) \ 32#define PER_CPU(var, reg) __percpu_mov_op $var, reg
33 __percpu_mov_op $per_cpu__##var, reg 33#define PER_CPU_VAR(var) var
34#define PER_CPU_VAR(var) per_cpu__##var
35#endif /* SMP */ 34#endif /* SMP */
36 35
37#ifdef CONFIG_X86_64_SMP 36#ifdef CONFIG_X86_64_SMP
38#define INIT_PER_CPU_VAR(var) init_per_cpu__##var 37#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
39#else 38#else
40#define INIT_PER_CPU_VAR(var) per_cpu__##var 39#define INIT_PER_CPU_VAR(var) var
41#endif 40#endif
42 41
43#else /* ...!ASSEMBLY */ 42#else /* ...!ASSEMBLY */
@@ -60,12 +59,12 @@
60 * There also must be an entry in vmlinux_64.lds.S 59 * There also must be an entry in vmlinux_64.lds.S
61 */ 60 */
62#define DECLARE_INIT_PER_CPU(var) \ 61#define DECLARE_INIT_PER_CPU(var) \
63 extern typeof(per_cpu_var(var)) init_per_cpu_var(var) 62 extern typeof(var) init_per_cpu_var(var)
64 63
65#ifdef CONFIG_X86_64_SMP 64#ifdef CONFIG_X86_64_SMP
66#define init_per_cpu_var(var) init_per_cpu__##var 65#define init_per_cpu_var(var) init_per_cpu__##var
67#else 66#else
68#define init_per_cpu_var(var) per_cpu_var(var) 67#define init_per_cpu_var(var) var
69#endif 68#endif
70 69
71/* For arch-specific code, we can use direct single-insn ops (they 70/* For arch-specific code, we can use direct single-insn ops (they
@@ -104,6 +103,64 @@ do { \
104 } \ 103 } \
105} while (0) 104} while (0)
106 105
106/*
107 * Generate a percpu add to memory instruction and optimize code
108 * if a one is added or subtracted.
109 */
110#define percpu_add_op(var, val) \
111do { \
112 typedef typeof(var) pao_T__; \
113 const int pao_ID__ = (__builtin_constant_p(val) && \
114 ((val) == 1 || (val) == -1)) ? (val) : 0; \
115 if (0) { \
116 pao_T__ pao_tmp__; \
117 pao_tmp__ = (val); \
118 } \
119 switch (sizeof(var)) { \
120 case 1: \
121 if (pao_ID__ == 1) \
122 asm("incb "__percpu_arg(0) : "+m" (var)); \
123 else if (pao_ID__ == -1) \
124 asm("decb "__percpu_arg(0) : "+m" (var)); \
125 else \
126 asm("addb %1, "__percpu_arg(0) \
127 : "+m" (var) \
128 : "qi" ((pao_T__)(val))); \
129 break; \
130 case 2: \
131 if (pao_ID__ == 1) \
132 asm("incw "__percpu_arg(0) : "+m" (var)); \
133 else if (pao_ID__ == -1) \
134 asm("decw "__percpu_arg(0) : "+m" (var)); \
135 else \
136 asm("addw %1, "__percpu_arg(0) \
137 : "+m" (var) \
138 : "ri" ((pao_T__)(val))); \
139 break; \
140 case 4: \
141 if (pao_ID__ == 1) \
142 asm("incl "__percpu_arg(0) : "+m" (var)); \
143 else if (pao_ID__ == -1) \
144 asm("decl "__percpu_arg(0) : "+m" (var)); \
145 else \
146 asm("addl %1, "__percpu_arg(0) \
147 : "+m" (var) \
148 : "ri" ((pao_T__)(val))); \
149 break; \
150 case 8: \
151 if (pao_ID__ == 1) \
152 asm("incq "__percpu_arg(0) : "+m" (var)); \
153 else if (pao_ID__ == -1) \
154 asm("decq "__percpu_arg(0) : "+m" (var)); \
155 else \
156 asm("addq %1, "__percpu_arg(0) \
157 : "+m" (var) \
158 : "re" ((pao_T__)(val))); \
159 break; \
160 default: __bad_percpu_size(); \
161 } \
162} while (0)
163
107#define percpu_from_op(op, var, constraint) \ 164#define percpu_from_op(op, var, constraint) \
108({ \ 165({ \
109 typeof(var) pfo_ret__; \ 166 typeof(var) pfo_ret__; \
@@ -142,16 +199,14 @@ do { \
142 * per-thread variables implemented as per-cpu variables and thus 199 * per-thread variables implemented as per-cpu variables and thus
143 * stable for the duration of the respective task. 200 * stable for the duration of the respective task.
144 */ 201 */
145#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ 202#define percpu_read(var) percpu_from_op("mov", var, "m" (var))
146 "m" (per_cpu__##var)) 203#define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var)))
147#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ 204#define percpu_write(var, val) percpu_to_op("mov", var, val)
148 "p" (&per_cpu__##var)) 205#define percpu_add(var, val) percpu_add_op(var, val)
149#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) 206#define percpu_sub(var, val) percpu_add_op(var, -(val))
150#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) 207#define percpu_and(var, val) percpu_to_op("and", var, val)
151#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) 208#define percpu_or(var, val) percpu_to_op("or", var, val)
152#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val) 209#define percpu_xor(var, val) percpu_to_op("xor", var, val)
153#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
154#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
155 210
156#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 211#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
157#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 212#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -160,9 +215,9 @@ do { \
160#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) 215#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
161#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) 216#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
162#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) 217#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
163#define __this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) 218#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
164#define __this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) 219#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
165#define __this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) 220#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
166#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) 221#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
167#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) 222#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
168#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) 223#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -179,9 +234,9 @@ do { \
179#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) 234#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
180#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) 235#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
181#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) 236#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
182#define this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) 237#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
183#define this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) 238#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
184#define this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) 239#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
185#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) 240#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
186#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) 241#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
187#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) 242#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -192,9 +247,9 @@ do { \
192#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 247#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
193#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 248#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
194 249
195#define irqsafe_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) 250#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
196#define irqsafe_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) 251#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
197#define irqsafe_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) 252#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
198#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) 253#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
199#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) 254#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
200#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) 255#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -212,19 +267,19 @@ do { \
212#ifdef CONFIG_X86_64 267#ifdef CONFIG_X86_64
213#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 268#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
214#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 269#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
215#define __this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) 270#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
216#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 271#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
217#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 272#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
218#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 273#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
219 274
220#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 275#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
221#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 276#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
222#define this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) 277#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
223#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 278#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
224#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 279#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
225#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 280#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
226 281
227#define irqsafe_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) 282#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
228#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 283#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
229#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 284#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
230#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 285#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
@@ -236,7 +291,7 @@ do { \
236({ \ 291({ \
237 int old__; \ 292 int old__; \
238 asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \ 293 asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
239 : "=r" (old__), "+m" (per_cpu__##var) \ 294 : "=r" (old__), "+m" (var) \
240 : "dIr" (bit)); \ 295 : "dIr" (bit)); \
241 old__; \ 296 old__; \
242}) 297})
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 1380367dabd9..db6109a885a7 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -18,7 +18,7 @@
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20 20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) 21#define ARCH_PERFMON_EVENTSEL_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21) 22#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21)
23#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) 23#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
24#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) 24#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
@@ -27,7 +27,14 @@
27/* 27/*
28 * Includes eventsel and unit mask as well: 28 * Includes eventsel and unit mask as well:
29 */ 29 */
30#define ARCH_PERFMON_EVENT_MASK 0xffff 30
31
32#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL
33#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL
34#define INTEL_ARCH_EDGE_MASK 0x00040000ULL
35#define INTEL_ARCH_INV_MASK 0x00800000ULL
36#define INTEL_ARCH_CNT_MASK 0xFF000000ULL
37#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK)
31 38
32/* 39/*
33 * filter mask to validate fixed counter events. 40 * filter mask to validate fixed counter events.
@@ -38,7 +45,12 @@
38 * The other filters are supported by fixed counters. 45 * The other filters are supported by fixed counters.
39 * The any-thread option is supported starting with v3. 46 * The any-thread option is supported starting with v3.
40 */ 47 */
41#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000 48#define INTEL_ARCH_FIXED_MASK \
49 (INTEL_ARCH_CNT_MASK| \
50 INTEL_ARCH_INV_MASK| \
51 INTEL_ARCH_EDGE_MASK|\
52 INTEL_ARCH_UNIT_MASK|\
53 INTEL_ARCH_EVENT_MASK)
42 54
43#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 55#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
44#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 56#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
@@ -105,6 +117,18 @@ union cpuid10_edx {
105 */ 117 */
106#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) 118#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
107 119
120/* IbsFetchCtl bits/masks */
121#define IBS_FETCH_RAND_EN (1ULL<<57)
122#define IBS_FETCH_VAL (1ULL<<49)
123#define IBS_FETCH_ENABLE (1ULL<<48)
124#define IBS_FETCH_CNT 0xFFFF0000ULL
125#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
126
127/* IbsOpCtl bits */
128#define IBS_OP_CNT_CTL (1ULL<<19)
129#define IBS_OP_VAL (1ULL<<18)
130#define IBS_OP_ENABLE (1ULL<<17)
131#define IBS_OP_MAX_CNT 0x0000FFFFULL
108 132
109#ifdef CONFIG_PERF_EVENTS 133#ifdef CONFIG_PERF_EVENTS
110extern void init_hw_perf_events(void); 134extern void init_hw_perf_events(void);
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 0e8c2a0fd922..271de94c3810 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -23,6 +23,11 @@ static inline void paravirt_release_pud(unsigned long pfn) {}
23#endif 23#endif
24 24
25/* 25/*
26 * Flags to use when allocating a user page table page.
27 */
28extern gfp_t __userpte_alloc_gfp;
29
30/*
26 * Allocate and free page tables. 31 * Allocate and free page tables.
27 */ 32 */
28extern pgd_t *pgd_alloc(struct mm_struct *); 33extern pgd_t *pgd_alloc(struct mm_struct *);
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 01fd9461d323..47339a1ac7b6 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -54,10 +54,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
54 in_irq() ? KM_IRQ_PTE : \ 54 in_irq() ? KM_IRQ_PTE : \
55 KM_PTE0) 55 KM_PTE0)
56#define pte_offset_map(dir, address) \ 56#define pte_offset_map(dir, address) \
57 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ 57 ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \
58 pte_index((address))) 58 pte_index((address)))
59#define pte_offset_map_nested(dir, address) \ 59#define pte_offset_map_nested(dir, address) \
60 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ 60 ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \
61 pte_index((address))) 61 pte_index((address)))
62#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) 62#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
63#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) 63#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
@@ -80,7 +80,7 @@ do { \
80 * The i386 doesn't have any external MMU info: the kernel page 80 * The i386 doesn't have any external MMU info: the kernel page
81 * tables contain all the necessary information. 81 * tables contain all the necessary information.
82 */ 82 */
83#define update_mmu_cache(vma, address, pte) do { } while (0) 83#define update_mmu_cache(vma, address, ptep) do { } while (0)
84 84
85#endif /* !__ASSEMBLY__ */ 85#endif /* !__ASSEMBLY__ */
86 86
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index c57a30117149..181be528c612 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -129,7 +129,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
129#define pte_unmap(pte) /* NOP */ 129#define pte_unmap(pte) /* NOP */
130#define pte_unmap_nested(pte) /* NOP */ 130#define pte_unmap_nested(pte) /* NOP */
131 131
132#define update_mmu_cache(vma, address, pte) do { } while (0) 132#define update_mmu_cache(vma, address, ptep) do { } while (0)
133 133
134/* Encode and de-code a swap entry */ 134/* Encode and de-code a swap entry */
135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fc801bab1b3b..b753ea59703a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -450,6 +450,8 @@ struct thread_struct {
450 struct perf_event *ptrace_bps[HBP_NUM]; 450 struct perf_event *ptrace_bps[HBP_NUM];
451 /* Debug status used for traps, single steps, etc... */ 451 /* Debug status used for traps, single steps, etc... */
452 unsigned long debugreg6; 452 unsigned long debugreg6;
453 /* Keep track of the exact dr7 value set by the user */
454 unsigned long ptrace_dr7;
453 /* Fault info: */ 455 /* Fault info: */
454 unsigned long cr2; 456 unsigned long cr2;
455 unsigned long trap_no; 457 unsigned long trap_no;
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 4009f6534f52..6f414ed88620 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -23,14 +23,4 @@ extern int reboot_force;
23 23
24long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); 24long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
25 25
26/*
27 * This looks more complex than it should be. But we need to
28 * get the type for the ~ right in round_down (it needs to be
29 * as wide as the result!), and we want to evaluate the macro
30 * arguments just once each.
31 */
32#define __round_mask(x,y) ((__typeof__(x))((y)-1))
33#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1)
34#define round_down(x,y) ((x) & ~__round_mask(x,y))
35
36#endif /* _ASM_X86_PROTO_H */ 26#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 9d369f680321..69a686a7dff0 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -274,18 +274,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
274 return 0; 274 return 0;
275} 275}
276 276
277/* Get Nth argument at function call */
278extern unsigned long regs_get_argument_nth(struct pt_regs *regs,
279 unsigned int n);
280
281/*
282 * These are defined as per linux/ptrace.h, which see.
283 */
284#define arch_has_single_step() (1) 277#define arch_has_single_step() (1)
285extern void user_enable_single_step(struct task_struct *);
286extern void user_disable_single_step(struct task_struct *);
287
288extern void user_enable_block_step(struct task_struct *);
289#ifdef CONFIG_X86_DEBUGCTLMSR 278#ifdef CONFIG_X86_DEBUGCTLMSR
290#define arch_has_block_step() (1) 279#define arch_has_block_step() (1)
291#else 280#else
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index ca7517d33776..606ede126972 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -41,6 +41,7 @@
41#include <linux/list.h> 41#include <linux/list.h>
42#include <linux/spinlock.h> 42#include <linux/spinlock.h>
43#include <linux/lockdep.h> 43#include <linux/lockdep.h>
44#include <asm/asm.h>
44 45
45struct rwsem_waiter; 46struct rwsem_waiter;
46 47
@@ -55,17 +56,28 @@ extern asmregparm struct rw_semaphore *
55 56
56/* 57/*
57 * the semaphore definition 58 * the semaphore definition
59 *
60 * The bias values and the counter type limits the number of
61 * potential readers/writers to 32767 for 32 bits and 2147483647
62 * for 64 bits.
58 */ 63 */
59 64
60#define RWSEM_UNLOCKED_VALUE 0x00000000 65#ifdef CONFIG_X86_64
61#define RWSEM_ACTIVE_BIAS 0x00000001 66# define RWSEM_ACTIVE_MASK 0xffffffffL
62#define RWSEM_ACTIVE_MASK 0x0000ffff 67#else
63#define RWSEM_WAITING_BIAS (-0x00010000) 68# define RWSEM_ACTIVE_MASK 0x0000ffffL
69#endif
70
71#define RWSEM_UNLOCKED_VALUE 0x00000000L
72#define RWSEM_ACTIVE_BIAS 0x00000001L
73#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
64#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 74#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
65#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 75#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
66 76
77typedef signed long rwsem_count_t;
78
67struct rw_semaphore { 79struct rw_semaphore {
68 signed long count; 80 rwsem_count_t count;
69 spinlock_t wait_lock; 81 spinlock_t wait_lock;
70 struct list_head wait_list; 82 struct list_head wait_list;
71#ifdef CONFIG_DEBUG_LOCK_ALLOC 83#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -105,7 +117,7 @@ do { \
105static inline void __down_read(struct rw_semaphore *sem) 117static inline void __down_read(struct rw_semaphore *sem)
106{ 118{
107 asm volatile("# beginning down_read\n\t" 119 asm volatile("# beginning down_read\n\t"
108 LOCK_PREFIX " incl (%%eax)\n\t" 120 LOCK_PREFIX _ASM_INC "(%1)\n\t"
109 /* adds 0x00000001, returns the old value */ 121 /* adds 0x00000001, returns the old value */
110 " jns 1f\n" 122 " jns 1f\n"
111 " call call_rwsem_down_read_failed\n" 123 " call call_rwsem_down_read_failed\n"
@@ -121,14 +133,14 @@ static inline void __down_read(struct rw_semaphore *sem)
121 */ 133 */
122static inline int __down_read_trylock(struct rw_semaphore *sem) 134static inline int __down_read_trylock(struct rw_semaphore *sem)
123{ 135{
124 __s32 result, tmp; 136 rwsem_count_t result, tmp;
125 asm volatile("# beginning __down_read_trylock\n\t" 137 asm volatile("# beginning __down_read_trylock\n\t"
126 " movl %0,%1\n\t" 138 " mov %0,%1\n\t"
127 "1:\n\t" 139 "1:\n\t"
128 " movl %1,%2\n\t" 140 " mov %1,%2\n\t"
129 " addl %3,%2\n\t" 141 " add %3,%2\n\t"
130 " jle 2f\n\t" 142 " jle 2f\n\t"
131 LOCK_PREFIX " cmpxchgl %2,%0\n\t" 143 LOCK_PREFIX " cmpxchg %2,%0\n\t"
132 " jnz 1b\n\t" 144 " jnz 1b\n\t"
133 "2:\n\t" 145 "2:\n\t"
134 "# ending __down_read_trylock\n\t" 146 "# ending __down_read_trylock\n\t"
@@ -143,13 +155,13 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
143 */ 155 */
144static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
145{ 157{
146 int tmp; 158 rwsem_count_t tmp;
147 159
148 tmp = RWSEM_ACTIVE_WRITE_BIAS; 160 tmp = RWSEM_ACTIVE_WRITE_BIAS;
149 asm volatile("# beginning down_write\n\t" 161 asm volatile("# beginning down_write\n\t"
150 LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" 162 LOCK_PREFIX " xadd %1,(%2)\n\t"
151 /* subtract 0x0000ffff, returns the old value */ 163 /* subtract 0x0000ffff, returns the old value */
152 " testl %%edx,%%edx\n\t" 164 " test %1,%1\n\t"
153 /* was the count 0 before? */ 165 /* was the count 0 before? */
154 " jz 1f\n" 166 " jz 1f\n"
155 " call call_rwsem_down_write_failed\n" 167 " call call_rwsem_down_write_failed\n"
@@ -170,9 +182,9 @@ static inline void __down_write(struct rw_semaphore *sem)
170 */ 182 */
171static inline int __down_write_trylock(struct rw_semaphore *sem) 183static inline int __down_write_trylock(struct rw_semaphore *sem)
172{ 184{
173 signed long ret = cmpxchg(&sem->count, 185 rwsem_count_t ret = cmpxchg(&sem->count,
174 RWSEM_UNLOCKED_VALUE, 186 RWSEM_UNLOCKED_VALUE,
175 RWSEM_ACTIVE_WRITE_BIAS); 187 RWSEM_ACTIVE_WRITE_BIAS);
176 if (ret == RWSEM_UNLOCKED_VALUE) 188 if (ret == RWSEM_UNLOCKED_VALUE)
177 return 1; 189 return 1;
178 return 0; 190 return 0;
@@ -183,9 +195,9 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
183 */ 195 */
184static inline void __up_read(struct rw_semaphore *sem) 196static inline void __up_read(struct rw_semaphore *sem)
185{ 197{
186 __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; 198 rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS;
187 asm volatile("# beginning __up_read\n\t" 199 asm volatile("# beginning __up_read\n\t"
188 LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" 200 LOCK_PREFIX " xadd %1,(%2)\n\t"
189 /* subtracts 1, returns the old value */ 201 /* subtracts 1, returns the old value */
190 " jns 1f\n\t" 202 " jns 1f\n\t"
191 " call call_rwsem_wake\n" 203 " call call_rwsem_wake\n"
@@ -201,18 +213,18 @@ static inline void __up_read(struct rw_semaphore *sem)
201 */ 213 */
202static inline void __up_write(struct rw_semaphore *sem) 214static inline void __up_write(struct rw_semaphore *sem)
203{ 215{
216 rwsem_count_t tmp;
204 asm volatile("# beginning __up_write\n\t" 217 asm volatile("# beginning __up_write\n\t"
205 " movl %2,%%edx\n\t" 218 LOCK_PREFIX " xadd %1,(%2)\n\t"
206 LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t"
207 /* tries to transition 219 /* tries to transition
208 0xffff0001 -> 0x00000000 */ 220 0xffff0001 -> 0x00000000 */
209 " jz 1f\n" 221 " jz 1f\n"
210 " call call_rwsem_wake\n" 222 " call call_rwsem_wake\n"
211 "1:\n\t" 223 "1:\n\t"
212 "# ending __up_write\n" 224 "# ending __up_write\n"
213 : "+m" (sem->count) 225 : "+m" (sem->count), "=d" (tmp)
214 : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS) 226 : "a" (sem), "1" (-RWSEM_ACTIVE_WRITE_BIAS)
215 : "memory", "cc", "edx"); 227 : "memory", "cc");
216} 228}
217 229
218/* 230/*
@@ -221,33 +233,38 @@ static inline void __up_write(struct rw_semaphore *sem)
221static inline void __downgrade_write(struct rw_semaphore *sem) 233static inline void __downgrade_write(struct rw_semaphore *sem)
222{ 234{
223 asm volatile("# beginning __downgrade_write\n\t" 235 asm volatile("# beginning __downgrade_write\n\t"
224 LOCK_PREFIX " addl %2,(%%eax)\n\t" 236 LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t"
225 /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ 237 /*
238 * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386)
239 * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64)
240 */
226 " jns 1f\n\t" 241 " jns 1f\n\t"
227 " call call_rwsem_downgrade_wake\n" 242 " call call_rwsem_downgrade_wake\n"
228 "1:\n\t" 243 "1:\n\t"
229 "# ending __downgrade_write\n" 244 "# ending __downgrade_write\n"
230 : "+m" (sem->count) 245 : "+m" (sem->count)
231 : "a" (sem), "i" (-RWSEM_WAITING_BIAS) 246 : "a" (sem), "er" (-RWSEM_WAITING_BIAS)
232 : "memory", "cc"); 247 : "memory", "cc");
233} 248}
234 249
235/* 250/*
236 * implement atomic add functionality 251 * implement atomic add functionality
237 */ 252 */
238static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) 253static inline void rwsem_atomic_add(rwsem_count_t delta,
254 struct rw_semaphore *sem)
239{ 255{
240 asm volatile(LOCK_PREFIX "addl %1,%0" 256 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
241 : "+m" (sem->count) 257 : "+m" (sem->count)
242 : "ir" (delta)); 258 : "er" (delta));
243} 259}
244 260
245/* 261/*
246 * implement exchange and add functionality 262 * implement exchange and add functionality
247 */ 263 */
248static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) 264static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
265 struct rw_semaphore *sem)
249{ 266{
250 int tmp = delta; 267 rwsem_count_t tmp = delta;
251 268
252 asm volatile(LOCK_PREFIX "xadd %0,%1" 269 asm volatile(LOCK_PREFIX "xadd %0,%1"
253 : "+r" (tmp), "+m" (sem->count) 270 : "+r" (tmp), "+m" (sem->count)
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 18e496c98ff0..86b1506f4179 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -37,10 +37,8 @@ void setup_bios_corruption_check(void);
37 37
38#ifdef CONFIG_X86_VISWS 38#ifdef CONFIG_X86_VISWS
39extern void visws_early_detect(void); 39extern void visws_early_detect(void);
40extern int is_visws_box(void);
41#else 40#else
42static inline void visws_early_detect(void) { } 41static inline void visws_early_detect(void) { }
43static inline int is_visws_box(void) { return 0; }
44#endif 42#endif
45 43
46extern unsigned long saved_video_mode; 44extern unsigned long saved_video_mode;
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 1e796782cd7b..4cfc90824068 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -135,6 +135,8 @@ int native_cpu_disable(void);
135void native_cpu_die(unsigned int cpu); 135void native_cpu_die(unsigned int cpu);
136void native_play_dead(void); 136void native_play_dead(void);
137void play_dead_common(void); 137void play_dead_common(void);
138void wbinvd_on_cpu(int cpu);
139int wbinvd_on_all_cpus(void);
138 140
139void native_send_call_func_ipi(const struct cpumask *mask); 141void native_send_call_func_ipi(const struct cpumask *mask);
140void native_send_call_func_single_ipi(int cpu); 142void native_send_call_func_single_ipi(int cpu);
@@ -147,6 +149,13 @@ static inline int num_booting_cpus(void)
147{ 149{
148 return cpumask_weight(cpu_callout_mask); 150 return cpumask_weight(cpu_callout_mask);
149} 151}
152#else /* !CONFIG_SMP */
153#define wbinvd_on_cpu(cpu) wbinvd()
154static inline int wbinvd_on_all_cpus(void)
155{
156 wbinvd();
157 return 0;
158}
150#endif /* CONFIG_SMP */ 159#endif /* CONFIG_SMP */
151 160
152extern unsigned disabled_cpus __cpuinitdata; 161extern unsigned disabled_cpus __cpuinitdata;
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 35e89122a42f..4dab78edbad9 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,8 +3,6 @@
3 3
4extern int kstack_depth_to_print; 4extern int kstack_depth_to_print;
5 5
6int x86_is_stack_id(int id, char *name);
7
8struct thread_info; 6struct thread_info;
9struct stacktrace_ops; 7struct stacktrace_ops;
10 8
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1fecb7e61130..38638cd2fa4c 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -313,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb {
313 313
314#define SVM_EXIT_ERR -1 314#define SVM_EXIT_ERR -1
315 315
316#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ 316#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
317 317
318#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" 318#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
319#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" 319#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index d5f69045c100..3ad421784ae7 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -26,8 +26,8 @@ asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
26asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); 26asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
27asmlinkage long sys32_fstatat(unsigned int, char __user *, 27asmlinkage long sys32_fstatat(unsigned int, char __user *,
28 struct stat64 __user *, int); 28 struct stat64 __user *, int);
29struct mmap_arg_struct; 29struct mmap_arg_struct32;
30asmlinkage long sys32_mmap(struct mmap_arg_struct __user *); 30asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);
31asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long); 31asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
32 32
33struct sigaction32; 33struct sigaction32;
@@ -40,8 +40,6 @@ asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,
40 compat_sigset_t __user *, unsigned int); 40 compat_sigset_t __user *, unsigned int);
41asmlinkage long sys32_alarm(unsigned int); 41asmlinkage long sys32_alarm(unsigned int);
42 42
43struct sel_arg_struct;
44asmlinkage long sys32_old_select(struct sel_arg_struct __user *);
45asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int); 43asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);
46asmlinkage long sys32_sysfs(int, u32, u32); 44asmlinkage long sys32_sysfs(int, u32, u32);
47 45
@@ -56,11 +54,6 @@ asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
56asmlinkage long sys32_personality(unsigned long); 54asmlinkage long sys32_personality(unsigned long);
57asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); 55asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
58 56
59struct oldold_utsname;
60struct old_utsname;
61asmlinkage long sys32_olduname(struct oldold_utsname __user *);
62long sys32_uname(struct old_utsname __user *);
63
64asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, 57asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
65 compat_uptr_t __user *, struct pt_regs *); 58 compat_uptr_t __user *, struct pt_regs *);
66asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); 59asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 8d33bc5462d1..c4a348f7bd43 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -16,6 +16,8 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/err.h> 17#include <linux/err.h>
18 18
19extern const unsigned long sys_call_table[];
20
19/* 21/*
20 * Only the low 32 bits of orig_ax are meaningful, so we return int. 22 * Only the low 32 bits of orig_ax are meaningful, so we return int.
21 * This importantly ignores the high bits on 64-bit, so comparisons 23 * This importantly ignores the high bits on 64-bit, so comparisons
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 8868b9420b0e..5c044b43e9a7 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -50,18 +50,6 @@ asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
50 struct old_sigaction __user *); 50 struct old_sigaction __user *);
51unsigned long sys_sigreturn(struct pt_regs *); 51unsigned long sys_sigreturn(struct pt_regs *);
52 52
53/* kernel/sys_i386_32.c */
54struct mmap_arg_struct;
55struct sel_arg_struct;
56struct oldold_utsname;
57struct old_utsname;
58
59asmlinkage int old_mmap(struct mmap_arg_struct __user *);
60asmlinkage int old_select(struct sel_arg_struct __user *);
61asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
62asmlinkage int sys_uname(struct old_utsname __user *);
63asmlinkage int sys_olduname(struct oldold_utsname __user *);
64
65/* kernel/vm86_32.c */ 53/* kernel/vm86_32.c */
66int sys_vm86old(struct vm86_struct __user *, struct pt_regs *); 54int sys_vm86old(struct vm86_struct __user *, struct pt_regs *);
67int sys_vm86(unsigned long, unsigned long, struct pt_regs *); 55int sys_vm86(unsigned long, unsigned long, struct pt_regs *);
@@ -73,11 +61,8 @@ int sys_vm86(unsigned long, unsigned long, struct pt_regs *);
73long sys_arch_prctl(int, unsigned long); 61long sys_arch_prctl(int, unsigned long);
74 62
75/* kernel/sys_x86_64.c */ 63/* kernel/sys_x86_64.c */
76struct new_utsname;
77
78asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, 64asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
79 unsigned long, unsigned long, unsigned long); 65 unsigned long, unsigned long, unsigned long);
80asmlinkage long sys_uname(struct new_utsname __user *);
81 66
82#endif /* CONFIG_X86_32 */ 67#endif /* CONFIG_X86_32 */
83#endif /* _ASM_X86_SYSCALLS_H */ 68#endif /* _ASM_X86_SYSCALLS_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index e04740f7a0bb..b8fe48ee2ed9 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -32,7 +32,7 @@ extern void show_regs_common(void);
32 "movl %P[task_canary](%[next]), %%ebx\n\t" \ 32 "movl %P[task_canary](%[next]), %%ebx\n\t" \
33 "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" 33 "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
34#define __switch_canary_oparam \ 34#define __switch_canary_oparam \
35 , [stack_canary] "=m" (per_cpu_var(stack_canary.canary)) 35 , [stack_canary] "=m" (stack_canary.canary)
36#define __switch_canary_iparam \ 36#define __switch_canary_iparam \
37 , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) 37 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
38#else /* CC_STACKPROTECTOR */ 38#else /* CC_STACKPROTECTOR */
@@ -114,7 +114,7 @@ do { \
114 "movq %P[task_canary](%%rsi),%%r8\n\t" \ 114 "movq %P[task_canary](%%rsi),%%r8\n\t" \
115 "movq %%r8,"__percpu_arg([gs_canary])"\n\t" 115 "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
116#define __switch_canary_oparam \ 116#define __switch_canary_oparam \
117 , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary)) 117 , [gs_canary] "=m" (irq_stack_union.stack_canary)
118#define __switch_canary_iparam \ 118#define __switch_canary_iparam \
119 , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) 119 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
120#else /* CC_STACKPROTECTOR */ 120#else /* CC_STACKPROTECTOR */
@@ -133,7 +133,7 @@ do { \
133 __switch_canary \ 133 __switch_canary \
134 "movq %P[thread_info](%%rsi),%%r8\n\t" \ 134 "movq %P[thread_info](%%rsi),%%r8\n\t" \
135 "movq %%rax,%%rdi\n\t" \ 135 "movq %%rax,%%rdi\n\t" \
136 "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ 136 "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
137 "jnz ret_from_fork\n\t" \ 137 "jnz ret_from_fork\n\t" \
138 RESTORE_CONTEXT \ 138 RESTORE_CONTEXT \
139 : "=a" (last) \ 139 : "=a" (last) \
@@ -143,7 +143,7 @@ do { \
143 [ti_flags] "i" (offsetof(struct thread_info, flags)), \ 143 [ti_flags] "i" (offsetof(struct thread_info, flags)), \
144 [_tif_fork] "i" (_TIF_FORK), \ 144 [_tif_fork] "i" (_TIF_FORK), \
145 [thread_info] "i" (offsetof(struct task_struct, stack)), \ 145 [thread_info] "i" (offsetof(struct task_struct, stack)), \
146 [current_task] "m" (per_cpu_var(current_task)) \ 146 [current_task] "m" (current_task) \
147 __switch_canary_iparam \ 147 __switch_canary_iparam \
148 : "memory", "cc" __EXTRA_CLOBBER) 148 : "memory", "cc" __EXTRA_CLOBBER)
149#endif 149#endif
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 535e421498f6..316708d5af92 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -8,6 +8,8 @@
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/prefetch.h> 9#include <linux/prefetch.h>
10#include <linux/lockdep.h> 10#include <linux/lockdep.h>
11#include <asm/alternative.h>
12#include <asm/cpufeature.h>
11#include <asm/page.h> 13#include <asm/page.h>
12 14
13/* 15/*
@@ -16,7 +18,24 @@
16 18
17/* Handles exceptions in both to and from, but doesn't do access_ok */ 19/* Handles exceptions in both to and from, but doesn't do access_ok */
18__must_check unsigned long 20__must_check unsigned long
19copy_user_generic(void *to, const void *from, unsigned len); 21copy_user_generic_string(void *to, const void *from, unsigned len);
22__must_check unsigned long
23copy_user_generic_unrolled(void *to, const void *from, unsigned len);
24
25static __always_inline __must_check unsigned long
26copy_user_generic(void *to, const void *from, unsigned len)
27{
28 unsigned ret;
29
30 alternative_call(copy_user_generic_unrolled,
31 copy_user_generic_string,
32 X86_FEATURE_REP_GOOD,
33 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
34 "=d" (len)),
35 "1" (to), "2" (from), "3" (len)
36 : "memory", "rcx", "r8", "r9", "r10", "r11");
37 return ret;
38}
20 39
21__must_check unsigned long 40__must_check unsigned long
22_copy_to_user(void __user *to, const void *from, unsigned len); 41_copy_to_user(void __user *to, const void *from, unsigned len);
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 3baf379fa840..beb9b5f8f8a4 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -354,6 +354,7 @@
354#define __ARCH_WANT_STAT64 354#define __ARCH_WANT_STAT64
355#define __ARCH_WANT_SYS_ALARM 355#define __ARCH_WANT_SYS_ALARM
356#define __ARCH_WANT_SYS_GETHOSTNAME 356#define __ARCH_WANT_SYS_GETHOSTNAME
357#define __ARCH_WANT_SYS_IPC
357#define __ARCH_WANT_SYS_PAUSE 358#define __ARCH_WANT_SYS_PAUSE
358#define __ARCH_WANT_SYS_SGETMASK 359#define __ARCH_WANT_SYS_SGETMASK
359#define __ARCH_WANT_SYS_SIGNAL 360#define __ARCH_WANT_SYS_SIGNAL
@@ -366,6 +367,9 @@
366#define __ARCH_WANT_SYS_LLSEEK 367#define __ARCH_WANT_SYS_LLSEEK
367#define __ARCH_WANT_SYS_NICE 368#define __ARCH_WANT_SYS_NICE
368#define __ARCH_WANT_SYS_OLD_GETRLIMIT 369#define __ARCH_WANT_SYS_OLD_GETRLIMIT
370#define __ARCH_WANT_SYS_OLD_UNAME
371#define __ARCH_WANT_SYS_OLD_MMAP
372#define __ARCH_WANT_SYS_OLD_SELECT
369#define __ARCH_WANT_SYS_OLDUMOUNT 373#define __ARCH_WANT_SYS_OLDUMOUNT
370#define __ARCH_WANT_SYS_SIGPENDING 374#define __ARCH_WANT_SYS_SIGPENDING
371#define __ARCH_WANT_SYS_SIGPROCMASK 375#define __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 4843f7ba754a..ff4307b0e81e 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -146,7 +146,7 @@ __SYSCALL(__NR_wait4, sys_wait4)
146#define __NR_kill 62 146#define __NR_kill 62
147__SYSCALL(__NR_kill, sys_kill) 147__SYSCALL(__NR_kill, sys_kill)
148#define __NR_uname 63 148#define __NR_uname 63
149__SYSCALL(__NR_uname, sys_uname) 149__SYSCALL(__NR_uname, sys_newuname)
150 150
151#define __NR_semget 64 151#define __NR_semget 64
152__SYSCALL(__NR_semget, sys_semget) 152__SYSCALL(__NR_semget, sys_semget)
@@ -680,6 +680,7 @@ __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
680#define __ARCH_WANT_SYS_LLSEEK 680#define __ARCH_WANT_SYS_LLSEEK
681#define __ARCH_WANT_SYS_NICE 681#define __ARCH_WANT_SYS_NICE
682#define __ARCH_WANT_SYS_OLD_GETRLIMIT 682#define __ARCH_WANT_SYS_OLD_GETRLIMIT
683#define __ARCH_WANT_SYS_OLD_UNAME
683#define __ARCH_WANT_SYS_OLDUMOUNT 684#define __ARCH_WANT_SYS_OLDUMOUNT
684#define __ARCH_WANT_SYS_SIGPENDING 685#define __ARCH_WANT_SYS_SIGPENDING
685#define __ARCH_WANT_SYS_SIGPROCMASK 686#define __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
index 999873b22e7f..24532c7da3d6 100644
--- a/arch/x86/include/asm/user.h
+++ b/arch/x86/include/asm/user.h
@@ -1,5 +1,63 @@
1#ifndef _ASM_X86_USER_H
2#define _ASM_X86_USER_H
3
1#ifdef CONFIG_X86_32 4#ifdef CONFIG_X86_32
2# include "user_32.h" 5# include "user_32.h"
3#else 6#else
4# include "user_64.h" 7# include "user_64.h"
5#endif 8#endif
9
10#include <asm/types.h>
11
12struct user_ymmh_regs {
13 /* 16 * 16 bytes for each YMMH-reg */
14 __u32 ymmh_space[64];
15};
16
17struct user_xsave_hdr {
18 __u64 xstate_bv;
19 __u64 reserved1[2];
20 __u64 reserved2[5];
21};
22
23/*
24 * The structure layout of user_xstateregs, used for exporting the
25 * extended register state through ptrace and core-dump (NT_X86_XSTATE note)
26 * interfaces will be same as the memory layout of xsave used by the processor
27 * (except for the bytes 464..511, which can be used by the software) and hence
28 * the size of this structure varies depending on the features supported by the
29 * processor and OS. The size of the structure that users need to use can be
30 * obtained by doing:
31 * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx);
32 * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that user (debuggers, etc.)
33 * need to use.
34 *
35 * For now, only the first 8 bytes of the software usable bytes[464..471] will
36 * be used and will be set to OS enabled xstate mask (which is same as the
37 * 64bit mask returned by the xgetbv's xCR0). Users (analyzing core dump
38 * remotely, etc.) can use this mask as well as the mask saved in the
39 * xstate_hdr bytes and interpret what states the processor/OS supports
40 * and what states are in modified/initialized conditions for the
41 * particular process/thread.
42 *
43 * Also when the user modifies certain state FP/SSE/etc through the
44 * ptrace interface, they must ensure that the xsave_hdr.xstate_bv
45 * bytes[512..519] of the memory layout are updated correspondingly.
46 * i.e., for example when FP state is modified to a non-init state,
47 * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to
48 * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc.
49 */
50#define USER_XSTATE_FX_SW_WORDS 6
51#define USER_XSTATE_XCR0_WORD 0
52
53struct user_xstateregs {
54 struct {
55 __u64 fpx_space[58];
56 __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS];
57 } i387;
58 struct user_xsave_hdr xsave_hdr;
59 struct user_ymmh_regs ymmh;
60 /* further processor state extensions go here */
61};
62
63#endif /* _ASM_X86_USER_H */
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 2751f3075d8b..71605c7d5c5c 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -18,8 +18,8 @@
18 * along with this program; if not, write to the Free Software 18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * 20 *
21 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. 21 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
22 * Copyright (c) Russ Anderson 22 * Copyright (c) Russ Anderson <rja@sgi.com>
23 */ 23 */
24 24
25#include <linux/rtc.h> 25#include <linux/rtc.h>
@@ -36,7 +36,8 @@ enum uv_bios_cmd {
36 UV_BIOS_WATCHLIST_ALLOC, 36 UV_BIOS_WATCHLIST_ALLOC,
37 UV_BIOS_WATCHLIST_FREE, 37 UV_BIOS_WATCHLIST_FREE,
38 UV_BIOS_MEMPROTECT, 38 UV_BIOS_MEMPROTECT,
39 UV_BIOS_GET_PARTITION_ADDR 39 UV_BIOS_GET_PARTITION_ADDR,
40 UV_BIOS_SET_LEGACY_VGA_TARGET
40}; 41};
41 42
42/* 43/*
@@ -89,13 +90,14 @@ extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
89extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); 90extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
90extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); 91extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
91 92
92extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); 93extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *);
93extern s64 uv_bios_freq_base(u64, u64 *); 94extern s64 uv_bios_freq_base(u64, u64 *);
94extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, 95extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int,
95 unsigned long *); 96 unsigned long *);
96extern int uv_bios_mq_watchlist_free(int, int); 97extern int uv_bios_mq_watchlist_free(int, int);
97extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); 98extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
98extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); 99extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
100extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus);
99 101
100extern void uv_bios_init(void); 102extern void uv_bios_init(void);
101 103
@@ -104,6 +106,7 @@ extern int uv_type;
104extern long sn_partition_id; 106extern long sn_partition_id;
105extern long sn_coherency_id; 107extern long sn_coherency_id;
106extern long sn_region_size; 108extern long sn_region_size;
109extern long system_serial_number;
107#define partition_coherence_id() (sn_coherency_id) 110#define partition_coherence_id() (sn_coherency_id)
108 111
109extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ 112extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index c0a01b5d985b..3bb9491b7659 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -11,6 +11,7 @@ struct mm_struct;
11extern enum uv_system_type get_uv_system_type(void); 11extern enum uv_system_type get_uv_system_type(void);
12extern int is_uv_system(void); 12extern int is_uv_system(void);
13extern void uv_cpu_init(void); 13extern void uv_cpu_init(void);
14extern void uv_nmi_init(void);
14extern void uv_system_init(void); 15extern void uv_system_init(void);
15extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
16 struct mm_struct *mm, 17 struct mm_struct *mm,
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 40be813fefb1..14cc74ba5d23 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -329,7 +329,8 @@ static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset
329 */ 329 */
330static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset) 330static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset)
331{ 331{
332 return UV_GLOBAL_GRU_MMR_BASE | offset | (pnode << uv_hub_info->m_val); 332 return UV_GLOBAL_GRU_MMR_BASE | offset |
333 ((unsigned long)pnode << uv_hub_info->m_val);
333} 334}
334 335
335static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val) 336static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val)
diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h
index 166adf61e770..2edb37637ead 100644
--- a/arch/x86/include/asm/visws/cobalt.h
+++ b/arch/x86/include/asm/visws/cobalt.h
@@ -122,4 +122,6 @@ extern char visws_board_type;
122 122
123extern char visws_board_rev; 123extern char visws_board_rev;
124 124
125extern int pci_visws_init(void);
126
125#endif /* _ASM_X86_VISWS_COBALT_H */ 127#endif /* _ASM_X86_VISWS_COBALT_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2b4945419a84..fb9a080740ec 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -53,6 +53,7 @@
53 */ 53 */
54#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 54#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
56#define SECONDARY_EXEC_RDTSCP 0x00000008
56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 57#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 58#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 59#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
@@ -251,6 +252,7 @@ enum vmcs_field {
251#define EXIT_REASON_MSR_READ 31 252#define EXIT_REASON_MSR_READ 31
252#define EXIT_REASON_MSR_WRITE 32 253#define EXIT_REASON_MSR_WRITE 32
253#define EXIT_REASON_MWAIT_INSTRUCTION 36 254#define EXIT_REASON_MWAIT_INSTRUCTION 36
255#define EXIT_REASON_MONITOR_INSTRUCTION 39
254#define EXIT_REASON_PAUSE_INSTRUCTION 40 256#define EXIT_REASON_PAUSE_INSTRUCTION 40
255#define EXIT_REASON_MCE_DURING_VMENTRY 41 257#define EXIT_REASON_MCE_DURING_VMENTRY 41
256#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 258#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
@@ -362,6 +364,7 @@ enum vmcs_field {
362#define VMX_EPTP_UC_BIT (1ull << 8) 364#define VMX_EPTP_UC_BIT (1ull << 8)
363#define VMX_EPTP_WB_BIT (1ull << 14) 365#define VMX_EPTP_WB_BIT (1ull << 14)
364#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 366#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
367#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
365#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 368#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
366#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 369#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
367#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 370#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -374,7 +377,7 @@ enum vmcs_field {
374#define VMX_EPT_READABLE_MASK 0x1ull 377#define VMX_EPT_READABLE_MASK 0x1ull
375#define VMX_EPT_WRITABLE_MASK 0x2ull 378#define VMX_EPT_WRITABLE_MASK 0x2ull
376#define VMX_EPT_EXECUTABLE_MASK 0x4ull 379#define VMX_EPT_EXECUTABLE_MASK 0x4ull
377#define VMX_EPT_IGMT_BIT (1ull << 6) 380#define VMX_EPT_IPAT_BIT (1ull << 6)
378 381
379#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 382#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
380 383
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index ea0e8ea15e15..519b54327d75 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -99,6 +99,20 @@ struct x86_init_iommu {
99}; 99};
100 100
101/** 101/**
102 * struct x86_init_pci - platform specific pci init functions
103 * @arch_init: platform specific pci arch init call
104 * @init: platform specific pci subsystem init
105 * @init_irq: platform specific pci irq init
106 * @fixup_irqs: platform specific pci irq fixup
107 */
108struct x86_init_pci {
109 int (*arch_init)(void);
110 int (*init)(void);
111 void (*init_irq)(void);
112 void (*fixup_irqs)(void);
113};
114
115/**
102 * struct x86_init_ops - functions for platform specific setup 116 * struct x86_init_ops - functions for platform specific setup
103 * 117 *
104 */ 118 */
@@ -110,6 +124,7 @@ struct x86_init_ops {
110 struct x86_init_paging paging; 124 struct x86_init_paging paging;
111 struct x86_init_timers timers; 125 struct x86_init_timers timers;
112 struct x86_init_iommu iommu; 126 struct x86_init_iommu iommu;
127 struct x86_init_pci pci;
113}; 128};
114 129
115/** 130/**
@@ -126,6 +141,7 @@ struct x86_cpuinit_ops {
126 * @get_wallclock: get time from HW clock like RTC etc. 141 * @get_wallclock: get time from HW clock like RTC etc.
127 * @set_wallclock: set time back to HW clock 142 * @set_wallclock: set time back to HW clock
128 * @is_untracked_pat_range exclude from PAT logic 143 * @is_untracked_pat_range exclude from PAT logic
144 * @nmi_init enable NMI on cpus
129 */ 145 */
130struct x86_platform_ops { 146struct x86_platform_ops {
131 unsigned long (*calibrate_tsc)(void); 147 unsigned long (*calibrate_tsc)(void);
@@ -133,6 +149,7 @@ struct x86_platform_ops {
133 int (*set_wallclock)(unsigned long nowtime); 149 int (*set_wallclock)(unsigned long nowtime);
134 void (*iommu_shutdown)(void); 150 void (*iommu_shutdown)(void);
135 bool (*is_untracked_pat_range)(u64 start, u64 end); 151 bool (*is_untracked_pat_range)(u64 start, u64 end);
152 void (*nmi_init)(void);
136}; 153};
137 154
138extern struct x86_init_ops x86_init; 155extern struct x86_init_ops x86_init;
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 727acc152344..ddc04ccad03b 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -27,9 +27,11 @@
27extern unsigned int xstate_size; 27extern unsigned int xstate_size;
28extern u64 pcntxt_mask; 28extern u64 pcntxt_mask;
29extern struct xsave_struct *init_xstate_buf; 29extern struct xsave_struct *init_xstate_buf;
30extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
30 31
31extern void xsave_cntxt_init(void); 32extern void xsave_cntxt_init(void);
32extern void xsave_init(void); 33extern void xsave_init(void);
34extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
33extern int init_fpu(struct task_struct *child); 35extern int init_fpu(struct task_struct *child);
34extern int check_for_xstate(struct i387_fxsave_struct __user *buf, 36extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
35 void __user *fpstate, 37 void __user *fpstate,
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d87f09bc5a52..4c58352209e0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -87,6 +87,7 @@ obj-$(CONFIG_VM86) += vm86_32.o
87obj-$(CONFIG_EARLY_PRINTK) += early_printk.o 87obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
88 88
89obj-$(CONFIG_HPET_TIMER) += hpet.o 89obj-$(CONFIG_HPET_TIMER) += hpet.o
90obj-$(CONFIG_APB_TIMER) += apb_timer.o
90 91
91obj-$(CONFIG_K8_NB) += k8.o 92obj-$(CONFIG_K8_NB) += k8.o
92obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 5c96b75c6ea8..a54d714545ff 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -35,6 +35,7 @@
35#include <linux/ioport.h> 35#include <linux/ioport.h>
36#include <linux/pci.h> 36#include <linux/pci.h>
37 37
38#include <asm/pci_x86.h>
38#include <asm/pgtable.h> 39#include <asm/pgtable.h>
39#include <asm/io_apic.h> 40#include <asm/io_apic.h>
40#include <asm/apic.h> 41#include <asm/apic.h>
@@ -49,6 +50,7 @@ EXPORT_SYMBOL(acpi_disabled);
49 50
50#ifdef CONFIG_X86_64 51#ifdef CONFIG_X86_64
51# include <asm/proto.h> 52# include <asm/proto.h>
53# include <asm/numa_64.h>
52#endif /* X86 */ 54#endif /* X86 */
53 55
54#define BAD_MADT_ENTRY(entry, end) ( \ 56#define BAD_MADT_ENTRY(entry, end) ( \
@@ -489,6 +491,25 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
489 */ 491 */
490#ifdef CONFIG_ACPI_HOTPLUG_CPU 492#ifdef CONFIG_ACPI_HOTPLUG_CPU
491 493
494static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
495{
496#ifdef CONFIG_ACPI_NUMA
497 int nid;
498
499 nid = acpi_get_node(handle);
500 if (nid == -1 || !node_online(nid))
501 return;
502#ifdef CONFIG_X86_64
503 apicid_to_node[physid] = nid;
504 numa_set_node(cpu, nid);
505#else /* CONFIG_X86_32 */
506 apicid_2_node[physid] = nid;
507 cpu_to_node_map[cpu] = nid;
508#endif
509
510#endif
511}
512
492static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) 513static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
493{ 514{
494 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 515 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -547,6 +568,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
547 } 568 }
548 569
549 cpu = cpumask_first(new_map); 570 cpu = cpumask_first(new_map);
571 acpi_map_cpu2node(handle, cpu, physid);
550 572
551 *pcpu = cpu; 573 *pcpu = cpu;
552 retval = 0; 574 retval = 0;
@@ -1351,14 +1373,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1351 }, 1373 },
1352 { 1374 {
1353 .callback = force_acpi_ht, 1375 .callback = force_acpi_ht,
1354 .ident = "ASUS P2B-DS",
1355 .matches = {
1356 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1357 DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1358 },
1359 },
1360 {
1361 .callback = force_acpi_ht,
1362 .ident = "ASUS CUR-DLS", 1376 .ident = "ASUS CUR-DLS",
1363 .matches = { 1377 .matches = {
1364 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), 1378 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
@@ -1611,6 +1625,9 @@ int __init acpi_boot_init(void)
1611 1625
1612 acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); 1626 acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
1613 1627
1628 if (!acpi_noirq)
1629 x86_init.pci.init = pci_acpi_init;
1630
1614 return 0; 1631 return 0;
1615} 1632}
1616 1633
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index de7353c0ce9c..3a4bf35c179b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,7 @@
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/vmalloc.h> 8#include <linux/vmalloc.h>
9#include <linux/memory.h> 9#include <linux/memory.h>
10#include <linux/stop_machine.h>
10#include <asm/alternative.h> 11#include <asm/alternative.h>
11#include <asm/sections.h> 12#include <asm/sections.h>
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
@@ -205,7 +206,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
205 struct alt_instr *end) 206 struct alt_instr *end)
206{ 207{
207 struct alt_instr *a; 208 struct alt_instr *a;
208 char insnbuf[MAX_PATCH_LEN]; 209 u8 insnbuf[MAX_PATCH_LEN];
209 210
210 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 211 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
211 for (a = start; a < end; a++) { 212 for (a = start; a < end; a++) {
@@ -223,6 +224,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
223 } 224 }
224#endif 225#endif
225 memcpy(insnbuf, a->replacement, a->replacementlen); 226 memcpy(insnbuf, a->replacement, a->replacementlen);
227 if (*insnbuf == 0xe8 && a->replacementlen == 5)
228 *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
226 add_nops(insnbuf + a->replacementlen, 229 add_nops(insnbuf + a->replacementlen,
227 a->instrlen - a->replacementlen); 230 a->instrlen - a->replacementlen);
228 text_poke_early(instr, insnbuf, a->instrlen); 231 text_poke_early(instr, insnbuf, a->instrlen);
@@ -390,6 +393,24 @@ void alternatives_smp_switch(int smp)
390 mutex_unlock(&smp_alt); 393 mutex_unlock(&smp_alt);
391} 394}
392 395
396/* Return 1 if the address range is reserved for smp-alternatives */
397int alternatives_text_reserved(void *start, void *end)
398{
399 struct smp_alt_module *mod;
400 u8 **ptr;
401 u8 *text_start = start;
402 u8 *text_end = end;
403
404 list_for_each_entry(mod, &smp_alt_modules, next) {
405 if (mod->text > text_end || mod->text_end < text_start)
406 continue;
407 for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
408 if (text_start <= *ptr && text_end >= *ptr)
409 return 1;
410 }
411
412 return 0;
413}
393#endif 414#endif
394 415
395#ifdef CONFIG_PARAVIRT 416#ifdef CONFIG_PARAVIRT
@@ -552,3 +573,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
552 local_irq_restore(flags); 573 local_irq_restore(flags);
553 return addr; 574 return addr;
554} 575}
576
577/*
578 * Cross-modifying kernel text with stop_machine().
579 * This code originally comes from immediate value.
580 */
581static atomic_t stop_machine_first;
582static int wrote_text;
583
584struct text_poke_params {
585 void *addr;
586 const void *opcode;
587 size_t len;
588};
589
590static int __kprobes stop_machine_text_poke(void *data)
591{
592 struct text_poke_params *tpp = data;
593
594 if (atomic_dec_and_test(&stop_machine_first)) {
595 text_poke(tpp->addr, tpp->opcode, tpp->len);
596 smp_wmb(); /* Make sure other cpus see that this has run */
597 wrote_text = 1;
598 } else {
599 while (!wrote_text)
600 cpu_relax();
601 smp_mb(); /* Load wrote_text before following execution */
602 }
603
604 flush_icache_range((unsigned long)tpp->addr,
605 (unsigned long)tpp->addr + tpp->len);
606 return 0;
607}
608
609/**
610 * text_poke_smp - Update instructions on a live kernel on SMP
611 * @addr: address to modify
612 * @opcode: source of the copy
613 * @len: length to copy
614 *
615 * Modify multi-byte instruction by using stop_machine() on SMP. This allows
616 * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
617 * should be allowed, since stop_machine() does _not_ protect code against
618 * NMI and MCE.
619 *
620 * Note: Must be called under get_online_cpus() and text_mutex.
621 */
622void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
623{
624 struct text_poke_params tpp;
625
626 tpp.addr = addr;
627 tpp.opcode = opcode;
628 tpp.len = len;
629 atomic_set(&stop_machine_first, 1);
630 wrote_text = 0;
631 stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
632 return addr;
633}
634
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
new file mode 100644
index 000000000000..4b7099526d2c
--- /dev/null
+++ b/arch/x86/kernel/apb_timer.c
@@ -0,0 +1,784 @@
1/*
2 * apb_timer.c: Driver for Langwell APB timers
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 *
12 * Note:
13 * Langwell is the south complex of Intel Moorestown MID platform. There are
14 * eight external timers in total that can be used by the operating system.
15 * The timer information, such as frequency and addresses, is provided to the
16 * OS via SFI tables.
17 * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
18 * individual redirection table entries (RTE).
19 * Unlike HPET, there is no master counter, therefore one of the timers are
20 * used as clocksource. The overall allocation looks like:
21 * - timer 0 - NR_CPUs for per cpu timer
22 * - one timer for clocksource
23 * - one timer for watchdog driver.
24 * It is also worth notice that APB timer does not support true one-shot mode,
25 * free-running mode will be used here to emulate one-shot mode.
26 * APB timer can also be used as broadcast timer along with per cpu local APIC
27 * timer, but by default APB timer has higher rating than local APIC timers.
28 */
29
30#include <linux/clocksource.h>
31#include <linux/clockchips.h>
32#include <linux/delay.h>
33#include <linux/errno.h>
34#include <linux/init.h>
35#include <linux/sysdev.h>
36#include <linux/pm.h>
37#include <linux/pci.h>
38#include <linux/sfi.h>
39#include <linux/interrupt.h>
40#include <linux/cpu.h>
41#include <linux/irq.h>
42
43#include <asm/fixmap.h>
44#include <asm/apb_timer.h>
45
46#define APBT_MASK CLOCKSOURCE_MASK(32)
47#define APBT_SHIFT 22
48#define APBT_CLOCKEVENT_RATING 150
49#define APBT_CLOCKSOURCE_RATING 250
50#define APBT_MIN_DELTA_USEC 200
51
52#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
53#define APBT_CLOCKEVENT0_NUM (0)
54#define APBT_CLOCKEVENT1_NUM (1)
55#define APBT_CLOCKSOURCE_NUM (2)
56
57static unsigned long apbt_address;
58static int apb_timer_block_enabled;
59static void __iomem *apbt_virt_address;
60static int phy_cs_timer_id;
61
62/*
63 * Common DW APB timer info
64 */
65static uint64_t apbt_freq;
66
67static void apbt_set_mode(enum clock_event_mode mode,
68 struct clock_event_device *evt);
69static int apbt_next_event(unsigned long delta,
70 struct clock_event_device *evt);
71static cycle_t apbt_read_clocksource(struct clocksource *cs);
72static void apbt_restart_clocksource(struct clocksource *cs);
73
74struct apbt_dev {
75 struct clock_event_device evt;
76 unsigned int num;
77 int cpu;
78 unsigned int irq;
79 unsigned int tick;
80 unsigned int count;
81 unsigned int flags;
82 char name[10];
83};
84
85int disable_apbt_percpu __cpuinitdata;
86
87static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
88
89#ifdef CONFIG_SMP
90static unsigned int apbt_num_timers_used;
91static struct apbt_dev *apbt_devs;
92#endif
93
94static inline unsigned long apbt_readl_reg(unsigned long a)
95{
96 return readl(apbt_virt_address + a);
97}
98
99static inline void apbt_writel_reg(unsigned long d, unsigned long a)
100{
101 writel(d, apbt_virt_address + a);
102}
103
104static inline unsigned long apbt_readl(int n, unsigned long a)
105{
106 return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
107}
108
109static inline void apbt_writel(int n, unsigned long d, unsigned long a)
110{
111 writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
112}
113
114static inline void apbt_set_mapping(void)
115{
116 struct sfi_timer_table_entry *mtmr;
117
118 if (apbt_virt_address) {
119 pr_debug("APBT base already mapped\n");
120 return;
121 }
122 mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
123 if (mtmr == NULL) {
124 printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
125 APBT_CLOCKEVENT0_NUM);
126 return;
127 }
128 apbt_address = (unsigned long)mtmr->phys_addr;
129 if (!apbt_address) {
130 printk(KERN_WARNING "No timer base from SFI, use default\n");
131 apbt_address = APBT_DEFAULT_BASE;
132 }
133 apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
134 if (apbt_virt_address) {
135 pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
136 (void *)apbt_address, (void *)apbt_virt_address);
137 } else {
138 pr_debug("Failed mapping APBT phy address at %p\n",\
139 (void *)apbt_address);
140 goto panic_noapbt;
141 }
142 apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
143 sfi_free_mtmr(mtmr);
144
145 /* Now figure out the physical timer id for clocksource device */
146 mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
147 if (mtmr == NULL)
148 goto panic_noapbt;
149
150 /* Now figure out the physical timer id */
151 phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
152 / APBTMRS_REG_SIZE;
153 pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
154 return;
155
156panic_noapbt:
157 panic("Failed to setup APB system timer\n");
158
159}
160
161static inline void apbt_clear_mapping(void)
162{
163 iounmap(apbt_virt_address);
164 apbt_virt_address = NULL;
165}
166
167/*
168 * APBT timer interrupt enable / disable
169 */
170static inline int is_apbt_capable(void)
171{
172 return apbt_virt_address ? 1 : 0;
173}
174
175static struct clocksource clocksource_apbt = {
176 .name = "apbt",
177 .rating = APBT_CLOCKSOURCE_RATING,
178 .read = apbt_read_clocksource,
179 .mask = APBT_MASK,
180 .shift = APBT_SHIFT,
181 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
182 .resume = apbt_restart_clocksource,
183};
184
185/* boot APB clock event device */
186static struct clock_event_device apbt_clockevent = {
187 .name = "apbt0",
188 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
189 .set_mode = apbt_set_mode,
190 .set_next_event = apbt_next_event,
191 .shift = APBT_SHIFT,
192 .irq = 0,
193 .rating = APBT_CLOCKEVENT_RATING,
194};
195
196/*
197 * if user does not want to use per CPU apb timer, just give it a lower rating
198 * than local apic timer and skip the late per cpu timer init.
199 */
200static inline int __init setup_x86_mrst_timer(char *arg)
201{
202 if (!arg)
203 return -EINVAL;
204
205 if (strcmp("apbt_only", arg) == 0)
206 disable_apbt_percpu = 0;
207 else if (strcmp("lapic_and_apbt", arg) == 0)
208 disable_apbt_percpu = 1;
209 else {
210 pr_warning("X86 MRST timer option %s not recognised"
211 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
212 arg);
213 return -EINVAL;
214 }
215 return 0;
216}
217__setup("x86_mrst_timer=", setup_x86_mrst_timer);
218
219/*
220 * start count down from 0xffff_ffff. this is done by toggling the enable bit
221 * then load initial load count to ~0.
222 */
223static void apbt_start_counter(int n)
224{
225 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
226
227 ctrl &= ~APBTMR_CONTROL_ENABLE;
228 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
229 apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
230 /* enable, mask interrupt */
231 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
232 ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
233 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
234 /* read it once to get cached counter value initialized */
235 apbt_read_clocksource(&clocksource_apbt);
236}
237
238static irqreturn_t apbt_interrupt_handler(int irq, void *data)
239{
240 struct apbt_dev *dev = (struct apbt_dev *)data;
241 struct clock_event_device *aevt = &dev->evt;
242
243 if (!aevt->event_handler) {
244 printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
245 dev->num);
246 return IRQ_NONE;
247 }
248 aevt->event_handler(aevt);
249 return IRQ_HANDLED;
250}
251
252static void apbt_restart_clocksource(struct clocksource *cs)
253{
254 apbt_start_counter(phy_cs_timer_id);
255}
256
257/* Setup IRQ routing via IOAPIC */
258#ifdef CONFIG_SMP
259static void apbt_setup_irq(struct apbt_dev *adev)
260{
261 struct irq_chip *chip;
262 struct irq_desc *desc;
263
264 /* timer0 irq has been setup early */
265 if (adev->irq == 0)
266 return;
267 desc = irq_to_desc(adev->irq);
268 chip = get_irq_chip(adev->irq);
269 disable_irq(adev->irq);
270 desc->status |= IRQ_MOVE_PCNTXT;
271 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
272 /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
273 set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
274 enable_irq(adev->irq);
275 if (system_state == SYSTEM_BOOTING)
276 if (request_irq(adev->irq, apbt_interrupt_handler,
277 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
278 adev->name, adev)) {
279 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
280 adev->num);
281 }
282}
283#endif
284
285static void apbt_enable_int(int n)
286{
287 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
288 /* clear pending intr */
289 apbt_readl(n, APBTMR_N_EOI);
290 ctrl &= ~APBTMR_CONTROL_INT;
291 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
292}
293
294static void apbt_disable_int(int n)
295{
296 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
297
298 ctrl |= APBTMR_CONTROL_INT;
299 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
300}
301
302
303static int __init apbt_clockevent_register(void)
304{
305 struct sfi_timer_table_entry *mtmr;
306 struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev);
307
308 mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
309 if (mtmr == NULL) {
310 printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
311 APBT_CLOCKEVENT0_NUM);
312 return -ENODEV;
313 }
314
315 /*
316 * We need to calculate the scaled math multiplication factor for
317 * nanosecond to apbt tick conversion.
318 * mult = (nsec/cycle)*2^APBT_SHIFT
319 */
320 apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
321 , NSEC_PER_SEC, APBT_SHIFT);
322
323 /* Calculate the min / max delta */
324 apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
325 &apbt_clockevent);
326 apbt_clockevent.min_delta_ns = clockevent_delta2ns(
327 APBT_MIN_DELTA_USEC*apbt_freq,
328 &apbt_clockevent);
329 /*
330 * Start apbt with the boot cpu mask and make it
331 * global if not used for per cpu timer.
332 */
333 apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
334 adev->num = smp_processor_id();
335 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
336
337 if (disable_apbt_percpu) {
338 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
339 global_clock_event = &adev->evt;
340 printk(KERN_DEBUG "%s clockevent registered as global\n",
341 global_clock_event->name);
342 }
343
344 if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
345 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
346 apbt_clockevent.name, adev)) {
347 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
348 apbt_clockevent.irq);
349 }
350
351 clockevents_register_device(&adev->evt);
352 /* Start APBT 0 interrupts */
353 apbt_enable_int(APBT_CLOCKEVENT0_NUM);
354
355 sfi_free_mtmr(mtmr);
356 return 0;
357}
358
359#ifdef CONFIG_SMP
360/* Should be called with per cpu */
361void apbt_setup_secondary_clock(void)
362{
363 struct apbt_dev *adev;
364 struct clock_event_device *aevt;
365 int cpu;
366
367 /* Don't register boot CPU clockevent */
368 cpu = smp_processor_id();
369 if (cpu == boot_cpu_id)
370 return;
371 /*
372 * We need to calculate the scaled math multiplication factor for
373 * nanosecond to apbt tick conversion.
374 * mult = (nsec/cycle)*2^APBT_SHIFT
375 */
376 printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
377 adev = &per_cpu(cpu_apbt_dev, cpu);
378 aevt = &adev->evt;
379
380 memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
381 aevt->cpumask = cpumask_of(cpu);
382 aevt->name = adev->name;
383 aevt->mode = CLOCK_EVT_MODE_UNUSED;
384
385 printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
386 cpu, aevt->name, *(u32 *)aevt->cpumask);
387
388 apbt_setup_irq(adev);
389
390 clockevents_register_device(aevt);
391
392 apbt_enable_int(cpu);
393
394 return;
395}
396
397/*
398 * this notify handler process CPU hotplug events. in case of S0i3, nonboot
399 * cpus are disabled/enabled frequently, for performance reasons, we keep the
400 * per cpu timer irq registered so that we do need to do free_irq/request_irq.
401 *
402 * TODO: it might be more reliable to directly disable percpu clockevent device
403 * without the notifier chain. currently, cpu 0 may get interrupts from other
404 * cpu timers during the offline process due to the ordering of notification.
405 * the extra interrupt is harmless.
406 */
407static int apbt_cpuhp_notify(struct notifier_block *n,
408 unsigned long action, void *hcpu)
409{
410 unsigned long cpu = (unsigned long)hcpu;
411 struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
412
413 switch (action & 0xf) {
414 case CPU_DEAD:
415 apbt_disable_int(cpu);
416 if (system_state == SYSTEM_RUNNING)
417 pr_debug("skipping APBT CPU %lu offline\n", cpu);
418 else if (adev) {
419 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
420 free_irq(adev->irq, adev);
421 }
422 break;
423 default:
424 pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
425 }
426 return NOTIFY_OK;
427}
428
429static __init int apbt_late_init(void)
430{
431 if (disable_apbt_percpu)
432 return 0;
433 /* This notifier should be called after workqueue is ready */
434 hotcpu_notifier(apbt_cpuhp_notify, -20);
435 return 0;
436}
437fs_initcall(apbt_late_init);
438#else
439
440void apbt_setup_secondary_clock(void) {}
441
442#endif /* CONFIG_SMP */
443
444static void apbt_set_mode(enum clock_event_mode mode,
445 struct clock_event_device *evt)
446{
447 unsigned long ctrl;
448 uint64_t delta;
449 int timer_num;
450 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
451
452 timer_num = adev->num;
453 pr_debug("%s CPU %d timer %d mode=%d\n",
454 __func__, first_cpu(*evt->cpumask), timer_num, mode);
455
456 switch (mode) {
457 case CLOCK_EVT_MODE_PERIODIC:
458 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
459 delta >>= apbt_clockevent.shift;
460 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
461 ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
462 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
463 /*
464 * DW APB p. 46, have to disable timer before load counter,
465 * may cause sync problem.
466 */
467 ctrl &= ~APBTMR_CONTROL_ENABLE;
468 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
469 udelay(1);
470 pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
471 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
472 ctrl |= APBTMR_CONTROL_ENABLE;
473 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
474 break;
475 /* APB timer does not have one-shot mode, use free running mode */
476 case CLOCK_EVT_MODE_ONESHOT:
477 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
478 /*
479 * set free running mode, this mode will let timer reload max
480 * timeout which will give time (3min on 25MHz clock) to rearm
481 * the next event, therefore emulate the one-shot mode.
482 */
483 ctrl &= ~APBTMR_CONTROL_ENABLE;
484 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
485
486 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
487 /* write again to set free running mode */
488 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
489
490 /*
491 * DW APB p. 46, load counter with all 1s before starting free
492 * running mode.
493 */
494 apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
495 ctrl &= ~APBTMR_CONTROL_INT;
496 ctrl |= APBTMR_CONTROL_ENABLE;
497 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
498 break;
499
500 case CLOCK_EVT_MODE_UNUSED:
501 case CLOCK_EVT_MODE_SHUTDOWN:
502 apbt_disable_int(timer_num);
503 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
504 ctrl &= ~APBTMR_CONTROL_ENABLE;
505 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
506 break;
507
508 case CLOCK_EVT_MODE_RESUME:
509 apbt_enable_int(timer_num);
510 break;
511 }
512}
513
514static int apbt_next_event(unsigned long delta,
515 struct clock_event_device *evt)
516{
517 unsigned long ctrl;
518 int timer_num;
519
520 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
521
522 timer_num = adev->num;
523 /* Disable timer */
524 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
525 ctrl &= ~APBTMR_CONTROL_ENABLE;
526 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
527 /* write new count */
528 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
529 ctrl |= APBTMR_CONTROL_ENABLE;
530 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
531 return 0;
532}
533
534/*
535 * APB timer clock is not in sync with pclk on Langwell, which translates to
536 * unreliable read value caused by sampling error. the error does not add up
537 * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
538 * would go backwards. the following code is trying to prevent time traveling
539 * backwards. little bit paranoid.
540 */
541static cycle_t apbt_read_clocksource(struct clocksource *cs)
542{
543 unsigned long t0, t1, t2;
544 static unsigned long last_read;
545
546bad_count:
547 t1 = apbt_readl(phy_cs_timer_id,
548 APBTMR_N_CURRENT_VALUE);
549 t2 = apbt_readl(phy_cs_timer_id,
550 APBTMR_N_CURRENT_VALUE);
551 if (unlikely(t1 < t2)) {
552 pr_debug("APBT: read current count error %lx:%lx:%lx\n",
553 t1, t2, t2 - t1);
554 goto bad_count;
555 }
556 /*
557 * check against cached last read, makes sure time does not go back.
558 * it could be a normal rollover but we will do tripple check anyway
559 */
560 if (unlikely(t2 > last_read)) {
561 /* check if we have a normal rollover */
562 unsigned long raw_intr_status =
563 apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
564 /*
565 * cs timer interrupt is masked but raw intr bit is set if
566 * rollover occurs. then we read EOI reg to clear it.
567 */
568 if (raw_intr_status & (1 << phy_cs_timer_id)) {
569 apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
570 goto out;
571 }
572 pr_debug("APB CS going back %lx:%lx:%lx ",
573 t2, last_read, t2 - last_read);
574bad_count_x3:
575 pr_debug(KERN_INFO "tripple check enforced\n");
576 t0 = apbt_readl(phy_cs_timer_id,
577 APBTMR_N_CURRENT_VALUE);
578 udelay(1);
579 t1 = apbt_readl(phy_cs_timer_id,
580 APBTMR_N_CURRENT_VALUE);
581 udelay(1);
582 t2 = apbt_readl(phy_cs_timer_id,
583 APBTMR_N_CURRENT_VALUE);
584 if ((t2 > t1) || (t1 > t0)) {
585 printk(KERN_ERR "Error: APB CS tripple check failed\n");
586 goto bad_count_x3;
587 }
588 }
589out:
590 last_read = t2;
591 return (cycle_t)~t2;
592}
593
594static int apbt_clocksource_register(void)
595{
596 u64 start, now;
597 cycle_t t1;
598
599 /* Start the counter, use timer 2 as source, timer 0/1 for event */
600 apbt_start_counter(phy_cs_timer_id);
601
602 /* Verify whether apbt counter works */
603 t1 = apbt_read_clocksource(&clocksource_apbt);
604 rdtscll(start);
605
606 /*
607 * We don't know the TSC frequency yet, but waiting for
608 * 200000 TSC cycles is safe:
609 * 4 GHz == 50us
610 * 1 GHz == 200us
611 */
612 do {
613 rep_nop();
614 rdtscll(now);
615 } while ((now - start) < 200000UL);
616
617 /* APBT is the only always on clocksource, it has to work! */
618 if (t1 == apbt_read_clocksource(&clocksource_apbt))
619 panic("APBT counter not counting. APBT disabled\n");
620
621 /*
622 * initialize and register APBT clocksource
623 * convert that to ns/clock cycle
624 * mult = (ns/c) * 2^APBT_SHIFT
625 */
626 clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
627 (unsigned long) apbt_freq, APBT_SHIFT);
628 clocksource_register(&clocksource_apbt);
629
630 return 0;
631}
632
633/*
634 * Early setup the APBT timer, only use timer 0 for booting then switch to
635 * per CPU timer if possible.
636 * returns 1 if per cpu apbt is setup
637 * returns 0 if no per cpu apbt is chosen
638 * panic if set up failed, this is the only platform timer on Moorestown.
639 */
640void __init apbt_time_init(void)
641{
642#ifdef CONFIG_SMP
643 int i;
644 struct sfi_timer_table_entry *p_mtmr;
645 unsigned int percpu_timer;
646 struct apbt_dev *adev;
647#endif
648
649 if (apb_timer_block_enabled)
650 return;
651 apbt_set_mapping();
652 if (apbt_virt_address) {
653 pr_debug("Found APBT version 0x%lx\n",\
654 apbt_readl_reg(APBTMRS_COMP_VERSION));
655 } else
656 goto out_noapbt;
657 /*
658 * Read the frequency and check for a sane value, for ESL model
659 * we extend the possible clock range to allow time scaling.
660 */
661
662 if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
663 pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
664 goto out_noapbt;
665 }
666 if (apbt_clocksource_register()) {
667 pr_debug("APBT has failed to register clocksource\n");
668 goto out_noapbt;
669 }
670 if (!apbt_clockevent_register())
671 apb_timer_block_enabled = 1;
672 else {
673 pr_debug("APBT has failed to register clockevent\n");
674 goto out_noapbt;
675 }
676#ifdef CONFIG_SMP
677 /* kernel cmdline disable apb timer, so we will use lapic timers */
678 if (disable_apbt_percpu) {
679 printk(KERN_INFO "apbt: disabled per cpu timer\n");
680 return;
681 }
682 pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
683 if (num_possible_cpus() <= sfi_mtimer_num) {
684 percpu_timer = 1;
685 apbt_num_timers_used = num_possible_cpus();
686 } else {
687 percpu_timer = 0;
688 apbt_num_timers_used = 1;
689 adev = &per_cpu(cpu_apbt_dev, 0);
690 adev->flags &= ~APBT_DEV_USED;
691 }
692 pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
693
694 /* here we set up per CPU timer data structure */
695 apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
696 GFP_KERNEL);
697 if (!apbt_devs) {
698 printk(KERN_ERR "Failed to allocate APB timer devices\n");
699 return;
700 }
701 for (i = 0; i < apbt_num_timers_used; i++) {
702 adev = &per_cpu(cpu_apbt_dev, i);
703 adev->num = i;
704 adev->cpu = i;
705 p_mtmr = sfi_get_mtmr(i);
706 if (p_mtmr) {
707 adev->tick = p_mtmr->freq_hz;
708 adev->irq = p_mtmr->irq;
709 } else
710 printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
711 adev->count = 0;
712 sprintf(adev->name, "apbt%d", i);
713 }
714#endif
715
716 return;
717
718out_noapbt:
719 apbt_clear_mapping();
720 apb_timer_block_enabled = 0;
721 panic("failed to enable APB timer\n");
722}
723
724static inline void apbt_disable(int n)
725{
726 if (is_apbt_capable()) {
727 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
728 ctrl &= ~APBTMR_CONTROL_ENABLE;
729 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
730 }
731}
732
733/* called before apb_timer_enable, use early map */
734unsigned long apbt_quick_calibrate()
735{
736 int i, scale;
737 u64 old, new;
738 cycle_t t1, t2;
739 unsigned long khz = 0;
740 u32 loop, shift;
741
742 apbt_set_mapping();
743 apbt_start_counter(phy_cs_timer_id);
744
745 /* check if the timer can count down, otherwise return */
746 old = apbt_read_clocksource(&clocksource_apbt);
747 i = 10000;
748 while (--i) {
749 if (old != apbt_read_clocksource(&clocksource_apbt))
750 break;
751 }
752 if (!i)
753 goto failed;
754
755 /* count 16 ms */
756 loop = (apbt_freq * 1000) << 4;
757
758 /* restart the timer to ensure it won't get to 0 in the calibration */
759 apbt_start_counter(phy_cs_timer_id);
760
761 old = apbt_read_clocksource(&clocksource_apbt);
762 old += loop;
763
764 t1 = __native_read_tsc();
765
766 do {
767 new = apbt_read_clocksource(&clocksource_apbt);
768 } while (new < old);
769
770 t2 = __native_read_tsc();
771
772 shift = 5;
773 if (unlikely(loop >> shift == 0)) {
774 printk(KERN_INFO
775 "APBT TSC calibration failed, not enough resolution\n");
776 return 0;
777 }
778 scale = (int)div_u64((t2 - t1), loop >> shift);
779 khz = (scale * apbt_freq * 1000) >> shift;
780 printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
781 return khz;
782failed:
783 return 0;
784}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index f147a95fd84a..3704997e8b25 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -31,7 +31,6 @@
31#include <asm/x86_init.h> 31#include <asm/x86_init.h>
32 32
33int gart_iommu_aperture; 33int gart_iommu_aperture;
34EXPORT_SYMBOL_GPL(gart_iommu_aperture);
35int gart_iommu_aperture_disabled __initdata; 34int gart_iommu_aperture_disabled __initdata;
36int gart_iommu_aperture_allowed __initdata; 35int gart_iommu_aperture_allowed __initdata;
37 36
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dfca210f6a10..00187f1fcfb7 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -581,7 +581,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
581 res = (((u64)(*deltatsc)) * pm_100ms); 581 res = (((u64)(*deltatsc)) * pm_100ms);
582 do_div(res, deltapm); 582 do_div(res, deltapm);
583 apic_printk(APIC_VERBOSE, "TSC delta adjusted to " 583 apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
584 "PM-Timer: %lu (%ld) \n", 584 "PM-Timer: %lu (%ld)\n",
585 (unsigned long)res, *deltatsc); 585 (unsigned long)res, *deltatsc);
586 *deltatsc = (long)res; 586 *deltatsc = (long)res;
587 } 587 }
@@ -1390,7 +1390,7 @@ void __init enable_IR_x2apic(void)
1390 } 1390 }
1391 1391
1392 local_irq_save(flags); 1392 local_irq_save(flags);
1393 mask_8259A(); 1393 legacy_pic->mask_all();
1394 mask_IO_APIC_setup(ioapic_entries); 1394 mask_IO_APIC_setup(ioapic_entries);
1395 1395
1396 if (dmar_table_init_ret) 1396 if (dmar_table_init_ret)
@@ -1422,7 +1422,7 @@ void __init enable_IR_x2apic(void)
1422nox2apic: 1422nox2apic:
1423 if (!ret) /* IR enabling failed */ 1423 if (!ret) /* IR enabling failed */
1424 restore_IO_APIC_setup(ioapic_entries); 1424 restore_IO_APIC_setup(ioapic_entries);
1425 unmask_8259A(); 1425 legacy_pic->restore_mask();
1426 local_irq_restore(flags); 1426 local_irq_restore(flags);
1427 1427
1428out: 1428out:
@@ -2018,7 +2018,7 @@ static int lapic_resume(struct sys_device *dev)
2018 } 2018 }
2019 2019
2020 mask_IO_APIC_setup(ioapic_entries); 2020 mask_IO_APIC_setup(ioapic_entries);
2021 mask_8259A(); 2021 legacy_pic->mask_all();
2022 } 2022 }
2023 2023
2024 if (x2apic_mode) 2024 if (x2apic_mode)
@@ -2062,7 +2062,7 @@ static int lapic_resume(struct sys_device *dev)
2062 2062
2063 if (intr_remapping_enabled) { 2063 if (intr_remapping_enabled) {
2064 reenable_intr_remapping(x2apic_mode); 2064 reenable_intr_remapping(x2apic_mode);
2065 unmask_8259A(); 2065 legacy_pic->restore_mask();
2066 restore_IO_APIC_setup(ioapic_entries); 2066 restore_IO_APIC_setup(ioapic_entries);
2067 free_ioapic_entries(ioapic_entries); 2067 free_ioapic_entries(ioapic_entries);
2068 } 2068 }
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index e3c3d820c325..09d3b17ce0c2 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -223,7 +223,7 @@ struct apic apic_flat = {
223}; 223};
224 224
225/* 225/*
226 * Physflat mode is used when there are more than 8 CPUs on a AMD system. 226 * Physflat mode is used when there are more than 8 CPUs on a system.
227 * We cannot use logical delivery in this case because the mask 227 * We cannot use logical delivery in this case because the mask
228 * overflows, so use physical mode. 228 * overflows, so use physical mode.
229 */ 229 */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e93a76bc8670..e4e0ddcb1546 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -73,8 +73,8 @@
73 */ 73 */
74int sis_apic_bug = -1; 74int sis_apic_bug = -1;
75 75
76static DEFINE_SPINLOCK(ioapic_lock); 76static DEFINE_RAW_SPINLOCK(ioapic_lock);
77static DEFINE_SPINLOCK(vector_lock); 77static DEFINE_RAW_SPINLOCK(vector_lock);
78 78
79/* 79/*
80 * # of IRQ routing registers 80 * # of IRQ routing registers
@@ -94,8 +94,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
94/* # of MP IRQ source entries */ 94/* # of MP IRQ source entries */
95int mp_irq_entries; 95int mp_irq_entries;
96 96
97/* Number of legacy interrupts */
98static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
99/* GSI interrupts */ 97/* GSI interrupts */
100static int nr_irqs_gsi = NR_IRQS_LEGACY; 98static int nr_irqs_gsi = NR_IRQS_LEGACY;
101 99
@@ -140,33 +138,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
140 138
141/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 139/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
142#ifdef CONFIG_SPARSE_IRQ 140#ifdef CONFIG_SPARSE_IRQ
143static struct irq_cfg irq_cfgx[] = { 141static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
144#else 142#else
145static struct irq_cfg irq_cfgx[NR_IRQS] = { 143static struct irq_cfg irq_cfgx[NR_IRQS];
146#endif 144#endif
147 [0] = { .vector = IRQ0_VECTOR, },
148 [1] = { .vector = IRQ1_VECTOR, },
149 [2] = { .vector = IRQ2_VECTOR, },
150 [3] = { .vector = IRQ3_VECTOR, },
151 [4] = { .vector = IRQ4_VECTOR, },
152 [5] = { .vector = IRQ5_VECTOR, },
153 [6] = { .vector = IRQ6_VECTOR, },
154 [7] = { .vector = IRQ7_VECTOR, },
155 [8] = { .vector = IRQ8_VECTOR, },
156 [9] = { .vector = IRQ9_VECTOR, },
157 [10] = { .vector = IRQ10_VECTOR, },
158 [11] = { .vector = IRQ11_VECTOR, },
159 [12] = { .vector = IRQ12_VECTOR, },
160 [13] = { .vector = IRQ13_VECTOR, },
161 [14] = { .vector = IRQ14_VECTOR, },
162 [15] = { .vector = IRQ15_VECTOR, },
163};
164
165void __init io_apic_disable_legacy(void)
166{
167 nr_legacy_irqs = 0;
168 nr_irqs_gsi = 0;
169}
170 145
171int __init arch_early_irq_init(void) 146int __init arch_early_irq_init(void)
172{ 147{
@@ -176,6 +151,11 @@ int __init arch_early_irq_init(void)
176 int node; 151 int node;
177 int i; 152 int i;
178 153
154 if (!legacy_pic->nr_legacy_irqs) {
155 nr_irqs_gsi = 0;
156 io_apic_irqs = ~0UL;
157 }
158
179 cfg = irq_cfgx; 159 cfg = irq_cfgx;
180 count = ARRAY_SIZE(irq_cfgx); 160 count = ARRAY_SIZE(irq_cfgx);
181 node= cpu_to_node(boot_cpu_id); 161 node= cpu_to_node(boot_cpu_id);
@@ -185,8 +165,14 @@ int __init arch_early_irq_init(void)
185 desc->chip_data = &cfg[i]; 165 desc->chip_data = &cfg[i];
186 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 166 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
187 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); 167 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
188 if (i < nr_legacy_irqs) 168 /*
189 cpumask_setall(cfg[i].domain); 169 * For legacy IRQ's, start with assigning irq0 to irq15 to
170 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
171 */
172 if (i < legacy_pic->nr_legacy_irqs) {
173 cfg[i].vector = IRQ0_VECTOR + i;
174 cpumask_set_cpu(0, cfg[i].domain);
175 }
190 } 176 }
191 177
192 return 0; 178 return 0;
@@ -406,7 +392,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
406 struct irq_pin_list *entry; 392 struct irq_pin_list *entry;
407 unsigned long flags; 393 unsigned long flags;
408 394
409 spin_lock_irqsave(&ioapic_lock, flags); 395 raw_spin_lock_irqsave(&ioapic_lock, flags);
410 for_each_irq_pin(entry, cfg->irq_2_pin) { 396 for_each_irq_pin(entry, cfg->irq_2_pin) {
411 unsigned int reg; 397 unsigned int reg;
412 int pin; 398 int pin;
@@ -415,11 +401,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
415 reg = io_apic_read(entry->apic, 0x10 + pin*2); 401 reg = io_apic_read(entry->apic, 0x10 + pin*2);
416 /* Is the remote IRR bit set? */ 402 /* Is the remote IRR bit set? */
417 if (reg & IO_APIC_REDIR_REMOTE_IRR) { 403 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
418 spin_unlock_irqrestore(&ioapic_lock, flags); 404 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
419 return true; 405 return true;
420 } 406 }
421 } 407 }
422 spin_unlock_irqrestore(&ioapic_lock, flags); 408 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
423 409
424 return false; 410 return false;
425} 411}
@@ -433,10 +419,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
433{ 419{
434 union entry_union eu; 420 union entry_union eu;
435 unsigned long flags; 421 unsigned long flags;
436 spin_lock_irqsave(&ioapic_lock, flags); 422 raw_spin_lock_irqsave(&ioapic_lock, flags);
437 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); 423 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
438 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); 424 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
439 spin_unlock_irqrestore(&ioapic_lock, flags); 425 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
440 return eu.entry; 426 return eu.entry;
441} 427}
442 428
@@ -459,9 +445,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
459void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 445void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
460{ 446{
461 unsigned long flags; 447 unsigned long flags;
462 spin_lock_irqsave(&ioapic_lock, flags); 448 raw_spin_lock_irqsave(&ioapic_lock, flags);
463 __ioapic_write_entry(apic, pin, e); 449 __ioapic_write_entry(apic, pin, e);
464 spin_unlock_irqrestore(&ioapic_lock, flags); 450 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
465} 451}
466 452
467/* 453/*
@@ -474,10 +460,10 @@ static void ioapic_mask_entry(int apic, int pin)
474 unsigned long flags; 460 unsigned long flags;
475 union entry_union eu = { .entry.mask = 1 }; 461 union entry_union eu = { .entry.mask = 1 };
476 462
477 spin_lock_irqsave(&ioapic_lock, flags); 463 raw_spin_lock_irqsave(&ioapic_lock, flags);
478 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 464 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
479 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 465 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
480 spin_unlock_irqrestore(&ioapic_lock, flags); 466 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
481} 467}
482 468
483/* 469/*
@@ -604,9 +590,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
604 590
605 BUG_ON(!cfg); 591 BUG_ON(!cfg);
606 592
607 spin_lock_irqsave(&ioapic_lock, flags); 593 raw_spin_lock_irqsave(&ioapic_lock, flags);
608 __mask_IO_APIC_irq(cfg); 594 __mask_IO_APIC_irq(cfg);
609 spin_unlock_irqrestore(&ioapic_lock, flags); 595 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
610} 596}
611 597
612static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) 598static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
@@ -614,9 +600,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
614 struct irq_cfg *cfg = desc->chip_data; 600 struct irq_cfg *cfg = desc->chip_data;
615 unsigned long flags; 601 unsigned long flags;
616 602
617 spin_lock_irqsave(&ioapic_lock, flags); 603 raw_spin_lock_irqsave(&ioapic_lock, flags);
618 __unmask_IO_APIC_irq(cfg); 604 __unmask_IO_APIC_irq(cfg);
619 spin_unlock_irqrestore(&ioapic_lock, flags); 605 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
620} 606}
621 607
622static void mask_IO_APIC_irq(unsigned int irq) 608static void mask_IO_APIC_irq(unsigned int irq)
@@ -865,7 +851,7 @@ static int __init find_isa_irq_apic(int irq, int type)
865 */ 851 */
866static int EISA_ELCR(unsigned int irq) 852static int EISA_ELCR(unsigned int irq)
867{ 853{
868 if (irq < nr_legacy_irqs) { 854 if (irq < legacy_pic->nr_legacy_irqs) {
869 unsigned int port = 0x4d0 + (irq >> 3); 855 unsigned int port = 0x4d0 + (irq >> 3);
870 return (inb(port) >> (irq & 7)) & 1; 856 return (inb(port) >> (irq & 7)) & 1;
871 } 857 }
@@ -1140,12 +1126,12 @@ void lock_vector_lock(void)
1140 /* Used to the online set of cpus does not change 1126 /* Used to the online set of cpus does not change
1141 * during assign_irq_vector. 1127 * during assign_irq_vector.
1142 */ 1128 */
1143 spin_lock(&vector_lock); 1129 raw_spin_lock(&vector_lock);
1144} 1130}
1145 1131
1146void unlock_vector_lock(void) 1132void unlock_vector_lock(void)
1147{ 1133{
1148 spin_unlock(&vector_lock); 1134 raw_spin_unlock(&vector_lock);
1149} 1135}
1150 1136
1151static int 1137static int
@@ -1162,7 +1148,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1162 * Also, we've got to be careful not to trash gate 1148 * Also, we've got to be careful not to trash gate
1163 * 0x80, because int 0x80 is hm, kind of importantish. ;) 1149 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1164 */ 1150 */
1165 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1151 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
1152 static int current_offset = VECTOR_OFFSET_START % 8;
1166 unsigned int old_vector; 1153 unsigned int old_vector;
1167 int cpu, err; 1154 int cpu, err;
1168 cpumask_var_t tmp_mask; 1155 cpumask_var_t tmp_mask;
@@ -1198,7 +1185,7 @@ next:
1198 if (vector >= first_system_vector) { 1185 if (vector >= first_system_vector) {
1199 /* If out of vectors on large boxen, must share them. */ 1186 /* If out of vectors on large boxen, must share them. */
1200 offset = (offset + 1) % 8; 1187 offset = (offset + 1) % 8;
1201 vector = FIRST_DEVICE_VECTOR + offset; 1188 vector = FIRST_EXTERNAL_VECTOR + offset;
1202 } 1189 }
1203 if (unlikely(current_vector == vector)) 1190 if (unlikely(current_vector == vector))
1204 continue; 1191 continue;
@@ -1232,9 +1219,9 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1232 int err; 1219 int err;
1233 unsigned long flags; 1220 unsigned long flags;
1234 1221
1235 spin_lock_irqsave(&vector_lock, flags); 1222 raw_spin_lock_irqsave(&vector_lock, flags);
1236 err = __assign_irq_vector(irq, cfg, mask); 1223 err = __assign_irq_vector(irq, cfg, mask);
1237 spin_unlock_irqrestore(&vector_lock, flags); 1224 raw_spin_unlock_irqrestore(&vector_lock, flags);
1238 return err; 1225 return err;
1239} 1226}
1240 1227
@@ -1268,11 +1255,16 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1268void __setup_vector_irq(int cpu) 1255void __setup_vector_irq(int cpu)
1269{ 1256{
1270 /* Initialize vector_irq on a new cpu */ 1257 /* Initialize vector_irq on a new cpu */
1271 /* This function must be called with vector_lock held */
1272 int irq, vector; 1258 int irq, vector;
1273 struct irq_cfg *cfg; 1259 struct irq_cfg *cfg;
1274 struct irq_desc *desc; 1260 struct irq_desc *desc;
1275 1261
1262 /*
1263 * vector_lock will make sure that we don't run into irq vector
1264 * assignments that might be happening on another cpu in parallel,
1265 * while we setup our initial vector to irq mappings.
1266 */
1267 raw_spin_lock(&vector_lock);
1276 /* Mark the inuse vectors */ 1268 /* Mark the inuse vectors */
1277 for_each_irq_desc(irq, desc) { 1269 for_each_irq_desc(irq, desc) {
1278 cfg = desc->chip_data; 1270 cfg = desc->chip_data;
@@ -1291,6 +1283,7 @@ void __setup_vector_irq(int cpu)
1291 if (!cpumask_test_cpu(cpu, cfg->domain)) 1283 if (!cpumask_test_cpu(cpu, cfg->domain))
1292 per_cpu(vector_irq, cpu)[vector] = -1; 1284 per_cpu(vector_irq, cpu)[vector] = -1;
1293 } 1285 }
1286 raw_spin_unlock(&vector_lock);
1294} 1287}
1295 1288
1296static struct irq_chip ioapic_chip; 1289static struct irq_chip ioapic_chip;
@@ -1440,6 +1433,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1440 1433
1441 cfg = desc->chip_data; 1434 cfg = desc->chip_data;
1442 1435
1436 /*
1437 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1438 * controllers like 8259. Now that IO-APIC can handle this irq, update
1439 * the cfg->domain.
1440 */
1441 if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
1442 apic->vector_allocation_domain(0, cfg->domain);
1443
1443 if (assign_irq_vector(irq, cfg, apic->target_cpus())) 1444 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1444 return; 1445 return;
1445 1446
@@ -1461,8 +1462,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1461 } 1462 }
1462 1463
1463 ioapic_register_intr(irq, desc, trigger); 1464 ioapic_register_intr(irq, desc, trigger);
1464 if (irq < nr_legacy_irqs) 1465 if (irq < legacy_pic->nr_legacy_irqs)
1465 disable_8259A_irq(irq); 1466 legacy_pic->chip->mask(irq);
1466 1467
1467 ioapic_write_entry(apic_id, pin, entry); 1468 ioapic_write_entry(apic_id, pin, entry);
1468} 1469}
@@ -1473,7 +1474,7 @@ static struct {
1473 1474
1474static void __init setup_IO_APIC_irqs(void) 1475static void __init setup_IO_APIC_irqs(void)
1475{ 1476{
1476 int apic_id = 0, pin, idx, irq; 1477 int apic_id, pin, idx, irq;
1477 int notcon = 0; 1478 int notcon = 0;
1478 struct irq_desc *desc; 1479 struct irq_desc *desc;
1479 struct irq_cfg *cfg; 1480 struct irq_cfg *cfg;
@@ -1481,14 +1482,7 @@ static void __init setup_IO_APIC_irqs(void)
1481 1482
1482 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1483 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1483 1484
1484#ifdef CONFIG_ACPI 1485 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1485 if (!acpi_disabled && acpi_ioapic) {
1486 apic_id = mp_find_ioapic(0);
1487 if (apic_id < 0)
1488 apic_id = 0;
1489 }
1490#endif
1491
1492 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1486 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1493 idx = find_irq_entry(apic_id, pin, mp_INT); 1487 idx = find_irq_entry(apic_id, pin, mp_INT);
1494 if (idx == -1) { 1488 if (idx == -1) {
@@ -1510,6 +1504,9 @@ static void __init setup_IO_APIC_irqs(void)
1510 1504
1511 irq = pin_2_irq(idx, apic_id, pin); 1505 irq = pin_2_irq(idx, apic_id, pin);
1512 1506
1507 if ((apic_id > 0) && (irq > 16))
1508 continue;
1509
1513 /* 1510 /*
1514 * Skip the timer IRQ if there's a quirk handler 1511 * Skip the timer IRQ if there's a quirk handler
1515 * installed and if it returns 1: 1512 * installed and if it returns 1:
@@ -1651,14 +1648,14 @@ __apicdebuginit(void) print_IO_APIC(void)
1651 1648
1652 for (apic = 0; apic < nr_ioapics; apic++) { 1649 for (apic = 0; apic < nr_ioapics; apic++) {
1653 1650
1654 spin_lock_irqsave(&ioapic_lock, flags); 1651 raw_spin_lock_irqsave(&ioapic_lock, flags);
1655 reg_00.raw = io_apic_read(apic, 0); 1652 reg_00.raw = io_apic_read(apic, 0);
1656 reg_01.raw = io_apic_read(apic, 1); 1653 reg_01.raw = io_apic_read(apic, 1);
1657 if (reg_01.bits.version >= 0x10) 1654 if (reg_01.bits.version >= 0x10)
1658 reg_02.raw = io_apic_read(apic, 2); 1655 reg_02.raw = io_apic_read(apic, 2);
1659 if (reg_01.bits.version >= 0x20) 1656 if (reg_01.bits.version >= 0x20)
1660 reg_03.raw = io_apic_read(apic, 3); 1657 reg_03.raw = io_apic_read(apic, 3);
1661 spin_unlock_irqrestore(&ioapic_lock, flags); 1658 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1662 1659
1663 printk("\n"); 1660 printk("\n");
1664 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); 1661 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
@@ -1697,7 +1694,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1697 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1694 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1698 1695
1699 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1696 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
1700 " Stat Dmod Deli Vect: \n"); 1697 " Stat Dmod Deli Vect:\n");
1701 1698
1702 for (i = 0; i <= reg_01.bits.entries; i++) { 1699 for (i = 0; i <= reg_01.bits.entries; i++) {
1703 struct IO_APIC_route_entry entry; 1700 struct IO_APIC_route_entry entry;
@@ -1875,12 +1872,12 @@ __apicdebuginit(void) print_PIC(void)
1875 unsigned int v; 1872 unsigned int v;
1876 unsigned long flags; 1873 unsigned long flags;
1877 1874
1878 if (!nr_legacy_irqs) 1875 if (!legacy_pic->nr_legacy_irqs)
1879 return; 1876 return;
1880 1877
1881 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1878 printk(KERN_DEBUG "\nprinting PIC contents\n");
1882 1879
1883 spin_lock_irqsave(&i8259A_lock, flags); 1880 raw_spin_lock_irqsave(&i8259A_lock, flags);
1884 1881
1885 v = inb(0xa1) << 8 | inb(0x21); 1882 v = inb(0xa1) << 8 | inb(0x21);
1886 printk(KERN_DEBUG "... PIC IMR: %04x\n", v); 1883 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
@@ -1894,7 +1891,7 @@ __apicdebuginit(void) print_PIC(void)
1894 outb(0x0a,0xa0); 1891 outb(0x0a,0xa0);
1895 outb(0x0a,0x20); 1892 outb(0x0a,0x20);
1896 1893
1897 spin_unlock_irqrestore(&i8259A_lock, flags); 1894 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
1898 1895
1899 printk(KERN_DEBUG "... PIC ISR: %04x\n", v); 1896 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1900 1897
@@ -1953,13 +1950,13 @@ void __init enable_IO_APIC(void)
1953 * The number of IO-APIC IRQ registers (== #pins): 1950 * The number of IO-APIC IRQ registers (== #pins):
1954 */ 1951 */
1955 for (apic = 0; apic < nr_ioapics; apic++) { 1952 for (apic = 0; apic < nr_ioapics; apic++) {
1956 spin_lock_irqsave(&ioapic_lock, flags); 1953 raw_spin_lock_irqsave(&ioapic_lock, flags);
1957 reg_01.raw = io_apic_read(apic, 1); 1954 reg_01.raw = io_apic_read(apic, 1);
1958 spin_unlock_irqrestore(&ioapic_lock, flags); 1955 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1959 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 1956 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1960 } 1957 }
1961 1958
1962 if (!nr_legacy_irqs) 1959 if (!legacy_pic->nr_legacy_irqs)
1963 return; 1960 return;
1964 1961
1965 for(apic = 0; apic < nr_ioapics; apic++) { 1962 for(apic = 0; apic < nr_ioapics; apic++) {
@@ -2016,7 +2013,7 @@ void disable_IO_APIC(void)
2016 */ 2013 */
2017 clear_IO_APIC(); 2014 clear_IO_APIC();
2018 2015
2019 if (!nr_legacy_irqs) 2016 if (!legacy_pic->nr_legacy_irqs)
2020 return; 2017 return;
2021 2018
2022 /* 2019 /*
@@ -2095,9 +2092,9 @@ void __init setup_ioapic_ids_from_mpc(void)
2095 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { 2092 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
2096 2093
2097 /* Read the register 0 value */ 2094 /* Read the register 0 value */
2098 spin_lock_irqsave(&ioapic_lock, flags); 2095 raw_spin_lock_irqsave(&ioapic_lock, flags);
2099 reg_00.raw = io_apic_read(apic_id, 0); 2096 reg_00.raw = io_apic_read(apic_id, 0);
2100 spin_unlock_irqrestore(&ioapic_lock, flags); 2097 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2101 2098
2102 old_id = mp_ioapics[apic_id].apicid; 2099 old_id = mp_ioapics[apic_id].apicid;
2103 2100
@@ -2156,16 +2153,16 @@ void __init setup_ioapic_ids_from_mpc(void)
2156 mp_ioapics[apic_id].apicid); 2153 mp_ioapics[apic_id].apicid);
2157 2154
2158 reg_00.bits.ID = mp_ioapics[apic_id].apicid; 2155 reg_00.bits.ID = mp_ioapics[apic_id].apicid;
2159 spin_lock_irqsave(&ioapic_lock, flags); 2156 raw_spin_lock_irqsave(&ioapic_lock, flags);
2160 io_apic_write(apic_id, 0, reg_00.raw); 2157 io_apic_write(apic_id, 0, reg_00.raw);
2161 spin_unlock_irqrestore(&ioapic_lock, flags); 2158 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2162 2159
2163 /* 2160 /*
2164 * Sanity check 2161 * Sanity check
2165 */ 2162 */
2166 spin_lock_irqsave(&ioapic_lock, flags); 2163 raw_spin_lock_irqsave(&ioapic_lock, flags);
2167 reg_00.raw = io_apic_read(apic_id, 0); 2164 reg_00.raw = io_apic_read(apic_id, 0);
2168 spin_unlock_irqrestore(&ioapic_lock, flags); 2165 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2169 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) 2166 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
2170 printk("could not set ID!\n"); 2167 printk("could not set ID!\n");
2171 else 2168 else
@@ -2248,15 +2245,15 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2248 unsigned long flags; 2245 unsigned long flags;
2249 struct irq_cfg *cfg; 2246 struct irq_cfg *cfg;
2250 2247
2251 spin_lock_irqsave(&ioapic_lock, flags); 2248 raw_spin_lock_irqsave(&ioapic_lock, flags);
2252 if (irq < nr_legacy_irqs) { 2249 if (irq < legacy_pic->nr_legacy_irqs) {
2253 disable_8259A_irq(irq); 2250 legacy_pic->chip->mask(irq);
2254 if (i8259A_irq_pending(irq)) 2251 if (legacy_pic->irq_pending(irq))
2255 was_pending = 1; 2252 was_pending = 1;
2256 } 2253 }
2257 cfg = irq_cfg(irq); 2254 cfg = irq_cfg(irq);
2258 __unmask_IO_APIC_irq(cfg); 2255 __unmask_IO_APIC_irq(cfg);
2259 spin_unlock_irqrestore(&ioapic_lock, flags); 2256 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2260 2257
2261 return was_pending; 2258 return was_pending;
2262} 2259}
@@ -2267,9 +2264,9 @@ static int ioapic_retrigger_irq(unsigned int irq)
2267 struct irq_cfg *cfg = irq_cfg(irq); 2264 struct irq_cfg *cfg = irq_cfg(irq);
2268 unsigned long flags; 2265 unsigned long flags;
2269 2266
2270 spin_lock_irqsave(&vector_lock, flags); 2267 raw_spin_lock_irqsave(&vector_lock, flags);
2271 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); 2268 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2272 spin_unlock_irqrestore(&vector_lock, flags); 2269 raw_spin_unlock_irqrestore(&vector_lock, flags);
2273 2270
2274 return 1; 2271 return 1;
2275} 2272}
@@ -2362,14 +2359,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2362 irq = desc->irq; 2359 irq = desc->irq;
2363 cfg = desc->chip_data; 2360 cfg = desc->chip_data;
2364 2361
2365 spin_lock_irqsave(&ioapic_lock, flags); 2362 raw_spin_lock_irqsave(&ioapic_lock, flags);
2366 ret = set_desc_affinity(desc, mask, &dest); 2363 ret = set_desc_affinity(desc, mask, &dest);
2367 if (!ret) { 2364 if (!ret) {
2368 /* Only the high 8 bits are valid. */ 2365 /* Only the high 8 bits are valid. */
2369 dest = SET_APIC_LOGICAL_ID(dest); 2366 dest = SET_APIC_LOGICAL_ID(dest);
2370 __target_IO_APIC_irq(irq, dest, cfg); 2367 __target_IO_APIC_irq(irq, dest, cfg);
2371 } 2368 }
2372 spin_unlock_irqrestore(&ioapic_lock, flags); 2369 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2373 2370
2374 return ret; 2371 return ret;
2375} 2372}
@@ -2604,9 +2601,9 @@ static void eoi_ioapic_irq(struct irq_desc *desc)
2604 irq = desc->irq; 2601 irq = desc->irq;
2605 cfg = desc->chip_data; 2602 cfg = desc->chip_data;
2606 2603
2607 spin_lock_irqsave(&ioapic_lock, flags); 2604 raw_spin_lock_irqsave(&ioapic_lock, flags);
2608 __eoi_ioapic_irq(irq, cfg); 2605 __eoi_ioapic_irq(irq, cfg);
2609 spin_unlock_irqrestore(&ioapic_lock, flags); 2606 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2610} 2607}
2611 2608
2612static void ack_apic_level(unsigned int irq) 2609static void ack_apic_level(unsigned int irq)
@@ -2784,8 +2781,8 @@ static inline void init_IO_APIC_traps(void)
2784 * so default to an old-fashioned 8259 2781 * so default to an old-fashioned 8259
2785 * interrupt if we can.. 2782 * interrupt if we can..
2786 */ 2783 */
2787 if (irq < nr_legacy_irqs) 2784 if (irq < legacy_pic->nr_legacy_irqs)
2788 make_8259A_irq(irq); 2785 legacy_pic->make_irq(irq);
2789 else 2786 else
2790 /* Strange. Oh, well.. */ 2787 /* Strange. Oh, well.. */
2791 desc->chip = &no_irq_chip; 2788 desc->chip = &no_irq_chip;
@@ -2942,7 +2939,7 @@ static inline void __init check_timer(void)
2942 /* 2939 /*
2943 * get/set the timer IRQ vector: 2940 * get/set the timer IRQ vector:
2944 */ 2941 */
2945 disable_8259A_irq(0); 2942 legacy_pic->chip->mask(0);
2946 assign_irq_vector(0, cfg, apic->target_cpus()); 2943 assign_irq_vector(0, cfg, apic->target_cpus());
2947 2944
2948 /* 2945 /*
@@ -2955,7 +2952,7 @@ static inline void __init check_timer(void)
2955 * automatically. 2952 * automatically.
2956 */ 2953 */
2957 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2954 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2958 init_8259A(1); 2955 legacy_pic->init(1);
2959#ifdef CONFIG_X86_32 2956#ifdef CONFIG_X86_32
2960 { 2957 {
2961 unsigned int ver; 2958 unsigned int ver;
@@ -3014,7 +3011,7 @@ static inline void __init check_timer(void)
3014 if (timer_irq_works()) { 3011 if (timer_irq_works()) {
3015 if (nmi_watchdog == NMI_IO_APIC) { 3012 if (nmi_watchdog == NMI_IO_APIC) {
3016 setup_nmi(); 3013 setup_nmi();
3017 enable_8259A_irq(0); 3014 legacy_pic->chip->unmask(0);
3018 } 3015 }
3019 if (disable_timer_pin_1 > 0) 3016 if (disable_timer_pin_1 > 0)
3020 clear_IO_APIC_pin(0, pin1); 3017 clear_IO_APIC_pin(0, pin1);
@@ -3037,14 +3034,14 @@ static inline void __init check_timer(void)
3037 */ 3034 */
3038 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); 3035 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
3039 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 3036 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
3040 enable_8259A_irq(0); 3037 legacy_pic->chip->unmask(0);
3041 if (timer_irq_works()) { 3038 if (timer_irq_works()) {
3042 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 3039 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
3043 timer_through_8259 = 1; 3040 timer_through_8259 = 1;
3044 if (nmi_watchdog == NMI_IO_APIC) { 3041 if (nmi_watchdog == NMI_IO_APIC) {
3045 disable_8259A_irq(0); 3042 legacy_pic->chip->mask(0);
3046 setup_nmi(); 3043 setup_nmi();
3047 enable_8259A_irq(0); 3044 legacy_pic->chip->unmask(0);
3048 } 3045 }
3049 goto out; 3046 goto out;
3050 } 3047 }
@@ -3052,7 +3049,7 @@ static inline void __init check_timer(void)
3052 * Cleanup, just in case ... 3049 * Cleanup, just in case ...
3053 */ 3050 */
3054 local_irq_disable(); 3051 local_irq_disable();
3055 disable_8259A_irq(0); 3052 legacy_pic->chip->mask(0);
3056 clear_IO_APIC_pin(apic2, pin2); 3053 clear_IO_APIC_pin(apic2, pin2);
3057 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 3054 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
3058 } 3055 }
@@ -3071,22 +3068,22 @@ static inline void __init check_timer(void)
3071 3068
3072 lapic_register_intr(0, desc); 3069 lapic_register_intr(0, desc);
3073 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 3070 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
3074 enable_8259A_irq(0); 3071 legacy_pic->chip->unmask(0);
3075 3072
3076 if (timer_irq_works()) { 3073 if (timer_irq_works()) {
3077 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 3074 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3078 goto out; 3075 goto out;
3079 } 3076 }
3080 local_irq_disable(); 3077 local_irq_disable();
3081 disable_8259A_irq(0); 3078 legacy_pic->chip->mask(0);
3082 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 3079 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
3083 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 3080 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
3084 3081
3085 apic_printk(APIC_QUIET, KERN_INFO 3082 apic_printk(APIC_QUIET, KERN_INFO
3086 "...trying to set up timer as ExtINT IRQ...\n"); 3083 "...trying to set up timer as ExtINT IRQ...\n");
3087 3084
3088 init_8259A(0); 3085 legacy_pic->init(0);
3089 make_8259A_irq(0); 3086 legacy_pic->make_irq(0);
3090 apic_write(APIC_LVT0, APIC_DM_EXTINT); 3087 apic_write(APIC_LVT0, APIC_DM_EXTINT);
3091 3088
3092 unlock_ExtINT_logic(); 3089 unlock_ExtINT_logic();
@@ -3128,7 +3125,7 @@ void __init setup_IO_APIC(void)
3128 /* 3125 /*
3129 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 3126 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
3130 */ 3127 */
3131 io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; 3128 io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
3132 3129
3133 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 3130 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
3134 /* 3131 /*
@@ -3139,7 +3136,7 @@ void __init setup_IO_APIC(void)
3139 sync_Arb_IDs(); 3136 sync_Arb_IDs();
3140 setup_IO_APIC_irqs(); 3137 setup_IO_APIC_irqs();
3141 init_IO_APIC_traps(); 3138 init_IO_APIC_traps();
3142 if (nr_legacy_irqs) 3139 if (legacy_pic->nr_legacy_irqs)
3143 check_timer(); 3140 check_timer();
3144} 3141}
3145 3142
@@ -3188,13 +3185,13 @@ static int ioapic_resume(struct sys_device *dev)
3188 data = container_of(dev, struct sysfs_ioapic_data, dev); 3185 data = container_of(dev, struct sysfs_ioapic_data, dev);
3189 entry = data->entry; 3186 entry = data->entry;
3190 3187
3191 spin_lock_irqsave(&ioapic_lock, flags); 3188 raw_spin_lock_irqsave(&ioapic_lock, flags);
3192 reg_00.raw = io_apic_read(dev->id, 0); 3189 reg_00.raw = io_apic_read(dev->id, 0);
3193 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { 3190 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
3194 reg_00.bits.ID = mp_ioapics[dev->id].apicid; 3191 reg_00.bits.ID = mp_ioapics[dev->id].apicid;
3195 io_apic_write(dev->id, 0, reg_00.raw); 3192 io_apic_write(dev->id, 0, reg_00.raw);
3196 } 3193 }
3197 spin_unlock_irqrestore(&ioapic_lock, flags); 3194 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3198 for (i = 0; i < nr_ioapic_registers[dev->id]; i++) 3195 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
3199 ioapic_write_entry(dev->id, i, entry[i]); 3196 ioapic_write_entry(dev->id, i, entry[i]);
3200 3197
@@ -3257,7 +3254,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3257 if (irq_want < nr_irqs_gsi) 3254 if (irq_want < nr_irqs_gsi)
3258 irq_want = nr_irqs_gsi; 3255 irq_want = nr_irqs_gsi;
3259 3256
3260 spin_lock_irqsave(&vector_lock, flags); 3257 raw_spin_lock_irqsave(&vector_lock, flags);
3261 for (new = irq_want; new < nr_irqs; new++) { 3258 for (new = irq_want; new < nr_irqs; new++) {
3262 desc_new = irq_to_desc_alloc_node(new, node); 3259 desc_new = irq_to_desc_alloc_node(new, node);
3263 if (!desc_new) { 3260 if (!desc_new) {
@@ -3276,7 +3273,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3276 irq = new; 3273 irq = new;
3277 break; 3274 break;
3278 } 3275 }
3279 spin_unlock_irqrestore(&vector_lock, flags); 3276 raw_spin_unlock_irqrestore(&vector_lock, flags);
3280 3277
3281 if (irq > 0) 3278 if (irq > 0)
3282 dynamic_irq_init_keep_chip_data(irq); 3279 dynamic_irq_init_keep_chip_data(irq);
@@ -3306,9 +3303,9 @@ void destroy_irq(unsigned int irq)
3306 dynamic_irq_cleanup_keep_chip_data(irq); 3303 dynamic_irq_cleanup_keep_chip_data(irq);
3307 3304
3308 free_irte(irq); 3305 free_irte(irq);
3309 spin_lock_irqsave(&vector_lock, flags); 3306 raw_spin_lock_irqsave(&vector_lock, flags);
3310 __clear_irq_vector(irq, get_irq_chip_data(irq)); 3307 __clear_irq_vector(irq, get_irq_chip_data(irq));
3311 spin_unlock_irqrestore(&vector_lock, flags); 3308 raw_spin_unlock_irqrestore(&vector_lock, flags);
3312} 3309}
3313 3310
3314/* 3311/*
@@ -3845,9 +3842,9 @@ int __init io_apic_get_redir_entries (int ioapic)
3845 union IO_APIC_reg_01 reg_01; 3842 union IO_APIC_reg_01 reg_01;
3846 unsigned long flags; 3843 unsigned long flags;
3847 3844
3848 spin_lock_irqsave(&ioapic_lock, flags); 3845 raw_spin_lock_irqsave(&ioapic_lock, flags);
3849 reg_01.raw = io_apic_read(ioapic, 1); 3846 reg_01.raw = io_apic_read(ioapic, 1);
3850 spin_unlock_irqrestore(&ioapic_lock, flags); 3847 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3851 3848
3852 return reg_01.bits.entries; 3849 return reg_01.bits.entries;
3853} 3850}
@@ -3930,7 +3927,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
3930 /* 3927 /*
3931 * IRQs < 16 are already in the irq_2_pin[] map 3928 * IRQs < 16 are already in the irq_2_pin[] map
3932 */ 3929 */
3933 if (irq >= nr_legacy_irqs) { 3930 if (irq >= legacy_pic->nr_legacy_irqs) {
3934 cfg = desc->chip_data; 3931 cfg = desc->chip_data;
3935 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { 3932 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3936 printk(KERN_INFO "can not add pin %d for irq %d\n", 3933 printk(KERN_INFO "can not add pin %d for irq %d\n",
@@ -4009,9 +4006,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
4009 if (physids_empty(apic_id_map)) 4006 if (physids_empty(apic_id_map))
4010 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); 4007 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
4011 4008
4012 spin_lock_irqsave(&ioapic_lock, flags); 4009 raw_spin_lock_irqsave(&ioapic_lock, flags);
4013 reg_00.raw = io_apic_read(ioapic, 0); 4010 reg_00.raw = io_apic_read(ioapic, 0);
4014 spin_unlock_irqrestore(&ioapic_lock, flags); 4011 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
4015 4012
4016 if (apic_id >= get_physical_broadcast()) { 4013 if (apic_id >= get_physical_broadcast()) {
4017 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " 4014 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
@@ -4045,10 +4042,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
4045 if (reg_00.bits.ID != apic_id) { 4042 if (reg_00.bits.ID != apic_id) {
4046 reg_00.bits.ID = apic_id; 4043 reg_00.bits.ID = apic_id;
4047 4044
4048 spin_lock_irqsave(&ioapic_lock, flags); 4045 raw_spin_lock_irqsave(&ioapic_lock, flags);
4049 io_apic_write(ioapic, 0, reg_00.raw); 4046 io_apic_write(ioapic, 0, reg_00.raw);
4050 reg_00.raw = io_apic_read(ioapic, 0); 4047 reg_00.raw = io_apic_read(ioapic, 0);
4051 spin_unlock_irqrestore(&ioapic_lock, flags); 4048 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
4052 4049
4053 /* Sanity check */ 4050 /* Sanity check */
4054 if (reg_00.bits.ID != apic_id) { 4051 if (reg_00.bits.ID != apic_id) {
@@ -4069,9 +4066,9 @@ int __init io_apic_get_version(int ioapic)
4069 union IO_APIC_reg_01 reg_01; 4066 union IO_APIC_reg_01 reg_01;
4070 unsigned long flags; 4067 unsigned long flags;
4071 4068
4072 spin_lock_irqsave(&ioapic_lock, flags); 4069 raw_spin_lock_irqsave(&ioapic_lock, flags);
4073 reg_01.raw = io_apic_read(ioapic, 1); 4070 reg_01.raw = io_apic_read(ioapic, 1);
4074 spin_unlock_irqrestore(&ioapic_lock, flags); 4071 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
4075 4072
4076 return reg_01.bits.version; 4073 return reg_01.bits.version;
4077} 4074}
@@ -4103,27 +4100,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4103#ifdef CONFIG_SMP 4100#ifdef CONFIG_SMP
4104void __init setup_ioapic_dest(void) 4101void __init setup_ioapic_dest(void)
4105{ 4102{
4106 int pin, ioapic = 0, irq, irq_entry; 4103 int pin, ioapic, irq, irq_entry;
4107 struct irq_desc *desc; 4104 struct irq_desc *desc;
4108 const struct cpumask *mask; 4105 const struct cpumask *mask;
4109 4106
4110 if (skip_ioapic_setup == 1) 4107 if (skip_ioapic_setup == 1)
4111 return; 4108 return;
4112 4109
4113#ifdef CONFIG_ACPI 4110 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
4114 if (!acpi_disabled && acpi_ioapic) {
4115 ioapic = mp_find_ioapic(0);
4116 if (ioapic < 0)
4117 ioapic = 0;
4118 }
4119#endif
4120
4121 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 4111 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4122 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 4112 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4123 if (irq_entry == -1) 4113 if (irq_entry == -1)
4124 continue; 4114 continue;
4125 irq = pin_2_irq(irq_entry, ioapic, pin); 4115 irq = pin_2_irq(irq_entry, ioapic, pin);
4126 4116
4117 if ((ioapic > 0) && (irq > 16))
4118 continue;
4119
4127 desc = irq_to_desc(irq); 4120 desc = irq_to_desc(irq);
4128 4121
4129 /* 4122 /*
@@ -4308,3 +4301,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4308 4301
4309 nr_ioapics++; 4302 nr_ioapics++;
4310} 4303}
4304
4305/* Enable IOAPIC early just for system timer */
4306void __init pre_init_apic_IRQ0(void)
4307{
4308 struct irq_cfg *cfg;
4309 struct irq_desc *desc;
4310
4311 printk(KERN_INFO "Early APIC setup for system timer0\n");
4312#ifndef CONFIG_SMP
4313 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
4314#endif
4315 desc = irq_to_desc_alloc_node(0, 0);
4316
4317 setup_local_APIC();
4318
4319 cfg = irq_cfg(0);
4320 add_pin_to_irq_node(cfg, 0, 0, 0);
4321 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
4322
4323 setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
4324}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 0159a69396cb..8aa65adbd25d 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -177,7 +177,7 @@ int __init check_nmi_watchdog(void)
177error: 177error:
178 if (nmi_watchdog == NMI_IO_APIC) { 178 if (nmi_watchdog == NMI_IO_APIC) {
179 if (!timer_through_8259) 179 if (!timer_through_8259)
180 disable_8259A_irq(0); 180 legacy_pic->chip->mask(0);
181 on_each_cpu(__acpi_nmi_disable, NULL, 1); 181 on_each_cpu(__acpi_nmi_disable, NULL, 1);
182 } 182 }
183 183
@@ -416,13 +416,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
416 416
417 /* We can be called before check_nmi_watchdog, hence NULL check. */ 417 /* We can be called before check_nmi_watchdog, hence NULL check. */
418 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { 418 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 419 static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
420 420
421 spin_lock(&lock); 421 raw_spin_lock(&lock);
422 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 422 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
423 show_regs(regs); 423 show_regs(regs);
424 dump_stack(); 424 dump_stack();
425 spin_unlock(&lock); 425 raw_spin_unlock(&lock);
426 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); 426 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
427 427
428 rc = 1; 428 rc = 1;
@@ -438,8 +438,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
438 * Ayiee, looks like this CPU is stuck ... 438 * Ayiee, looks like this CPU is stuck ...
439 * wait a few IRQs (5 seconds) before doing the oops ... 439 * wait a few IRQs (5 seconds) before doing the oops ...
440 */ 440 */
441 __this_cpu_inc(per_cpu_var(alert_counter)); 441 __this_cpu_inc(alert_counter);
442 if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) 442 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
443 /* 443 /*
444 * die_nmi will return ONLY if NOTIFY_STOP happens.. 444 * die_nmi will return ONLY if NOTIFY_STOP happens..
445 */ 445 */
@@ -447,7 +447,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
447 regs, panic_on_timeout); 447 regs, panic_on_timeout);
448 } else { 448 } else {
449 __get_cpu_var(last_irq_sum) = sum; 449 __get_cpu_var(last_irq_sum) = sum;
450 __this_cpu_write(per_cpu_var(alert_counter), 0); 450 __this_cpu_write(alert_counter, 0);
451 } 451 }
452 452
453 /* see if the nmi watchdog went off */ 453 /* see if the nmi watchdog went off */
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 98c4665f251c..3e28401f161c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
225 225
226 mpc_record = 0; 226 mpc_record = 0;
227 printk(KERN_INFO 227 printk(KERN_INFO
228 "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); 228 "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
229 229
230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { 230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
231 printk(KERN_WARNING 231 printk(KERN_WARNING
@@ -277,6 +277,7 @@ static __init void early_check_numaq(void)
277 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; 277 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
278 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; 278 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
279 x86_init.timers.tsc_pre_init = numaq_tsc_init; 279 x86_init.timers.tsc_pre_init = numaq_tsc_init;
280 x86_init.pci.init = pci_numaq_init;
280 } 281 }
281} 282}
282 283
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index af0ca80e38a9..49dbeaef2a27 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
@@ -20,6 +20,8 @@
20#include <linux/cpu.h> 20#include <linux/cpu.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/io.h> 22#include <linux/io.h>
23#include <linux/pci.h>
24#include <linux/kdebug.h>
23 25
24#include <asm/uv/uv_mmrs.h> 26#include <asm/uv/uv_mmrs.h>
25#include <asm/uv/uv_hub.h> 27#include <asm/uv/uv_hub.h>
@@ -34,10 +36,13 @@
34 36
35DEFINE_PER_CPU(int, x2apic_extra_bits); 37DEFINE_PER_CPU(int, x2apic_extra_bits);
36 38
39#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
40
37static enum uv_system_type uv_system_type; 41static enum uv_system_type uv_system_type;
38static u64 gru_start_paddr, gru_end_paddr; 42static u64 gru_start_paddr, gru_end_paddr;
39int uv_min_hub_revision_id; 43int uv_min_hub_revision_id;
40EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); 44EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
45static DEFINE_SPINLOCK(uv_nmi_lock);
41 46
42static inline bool is_GRU_range(u64 start, u64 end) 47static inline bool is_GRU_range(u64 start, u64 end)
43{ 48{
@@ -71,6 +76,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
71 if (!strcmp(oem_id, "SGI")) { 76 if (!strcmp(oem_id, "SGI")) {
72 nodeid = early_get_nodeid(); 77 nodeid = early_get_nodeid();
73 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 78 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
79 x86_platform.nmi_init = uv_nmi_init;
74 if (!strcmp(oem_table_id, "UVL")) 80 if (!strcmp(oem_table_id, "UVL"))
75 uv_system_type = UV_LEGACY_APIC; 81 uv_system_type = UV_LEGACY_APIC;
76 else if (!strcmp(oem_table_id, "UVX")) 82 else if (!strcmp(oem_table_id, "UVX"))
@@ -480,7 +486,7 @@ static void uv_heartbeat(unsigned long ignored)
480 486
481static void __cpuinit uv_heartbeat_enable(int cpu) 487static void __cpuinit uv_heartbeat_enable(int cpu)
482{ 488{
483 if (!uv_cpu_hub_info(cpu)->scir.enabled) { 489 while (!uv_cpu_hub_info(cpu)->scir.enabled) {
484 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; 490 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
485 491
486 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); 492 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
@@ -488,11 +494,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu)
488 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; 494 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
489 add_timer_on(timer, cpu); 495 add_timer_on(timer, cpu);
490 uv_cpu_hub_info(cpu)->scir.enabled = 1; 496 uv_cpu_hub_info(cpu)->scir.enabled = 1;
491 }
492 497
493 /* check boot cpu */ 498 /* also ensure that boot cpu is enabled */
494 if (!uv_cpu_hub_info(0)->scir.enabled) 499 cpu = 0;
495 uv_heartbeat_enable(0); 500 }
496} 501}
497 502
498#ifdef CONFIG_HOTPLUG_CPU 503#ifdef CONFIG_HOTPLUG_CPU
@@ -551,6 +556,30 @@ late_initcall(uv_init_heartbeat);
551 556
552#endif /* !CONFIG_HOTPLUG_CPU */ 557#endif /* !CONFIG_HOTPLUG_CPU */
553 558
559/* Direct Legacy VGA I/O traffic to designated IOH */
560int uv_set_vga_state(struct pci_dev *pdev, bool decode,
561 unsigned int command_bits, bool change_bridge)
562{
563 int domain, bus, rc;
564
565 PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
566 pdev->devfn, decode, command_bits, change_bridge);
567
568 if (!change_bridge)
569 return 0;
570
571 if ((command_bits & PCI_COMMAND_IO) == 0)
572 return 0;
573
574 domain = pci_domain_nr(pdev->bus);
575 bus = pdev->bus->number;
576
577 rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
578 PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
579
580 return rc;
581}
582
554/* 583/*
555 * Called on each cpu to initialize the per_cpu UV data area. 584 * Called on each cpu to initialize the per_cpu UV data area.
556 * FIXME: hotplug not supported yet 585 * FIXME: hotplug not supported yet
@@ -567,6 +596,46 @@ void __cpuinit uv_cpu_init(void)
567 set_x2apic_extra_bits(uv_hub_info->pnode); 596 set_x2apic_extra_bits(uv_hub_info->pnode);
568} 597}
569 598
599/*
600 * When NMI is received, print a stack trace.
601 */
602int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
603{
604 if (reason != DIE_NMI_IPI)
605 return NOTIFY_OK;
606 /*
607 * Use a lock so only one cpu prints at a time
608 * to prevent intermixed output.
609 */
610 spin_lock(&uv_nmi_lock);
611 pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
612 dump_stack();
613 spin_unlock(&uv_nmi_lock);
614
615 return NOTIFY_STOP;
616}
617
618static struct notifier_block uv_dump_stack_nmi_nb = {
619 .notifier_call = uv_handle_nmi
620};
621
622void uv_register_nmi_notifier(void)
623{
624 if (register_die_notifier(&uv_dump_stack_nmi_nb))
625 printk(KERN_WARNING "UV NMI handler failed to register\n");
626}
627
628void uv_nmi_init(void)
629{
630 unsigned int value;
631
632 /*
633 * Unmask NMI on all cpus
634 */
635 value = apic_read(APIC_LVT1) | APIC_DM_NMI;
636 value &= ~APIC_LVT_MASKED;
637 apic_write(APIC_LVT1, value);
638}
570 639
571void __init uv_system_init(void) 640void __init uv_system_init(void)
572{ 641{
@@ -632,8 +701,8 @@ void __init uv_system_init(void)
632 } 701 }
633 702
634 uv_bios_init(); 703 uv_bios_init();
635 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 704 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
636 &sn_coherency_id, &sn_region_size); 705 &sn_region_size, &system_serial_number);
637 uv_rtc_init(); 706 uv_rtc_init();
638 707
639 for_each_present_cpu(cpu) { 708 for_each_present_cpu(cpu) {
@@ -688,5 +757,9 @@ void __init uv_system_init(void)
688 757
689 uv_cpu_init(); 758 uv_cpu_init();
690 uv_scir_register_cpu_notifier(); 759 uv_scir_register_cpu_notifier();
760 uv_register_nmi_notifier();
691 proc_mkdir("sgi_uv", NULL); 761 proc_mkdir("sgi_uv", NULL);
762
763 /* register Legacy VGA I/O redirection handler */
764 pci_register_set_vga_state(uv_set_vga_state);
692} 765}
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index b5b6b23bce53..031aa887b0eb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1992,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
1992 apm_info.disabled = 1; 1992 apm_info.disabled = 1;
1993 printk(KERN_INFO "%s machine detected. " 1993 printk(KERN_INFO "%s machine detected. "
1994 "Disabling APM.\n", d->ident); 1994 "Disabling APM.\n", d->ident);
1995 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); 1995 printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
1996 printk(KERN_INFO "download from support.intel.com \n"); 1996 printk(KERN_INFO "download from support.intel.com\n");
1997 } 1997 }
1998 return 0; 1998 return 0;
1999} 1999}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index b0206a211b09..8bc57baaa9ad 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -15,8 +15,8 @@
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 * 17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. 18 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson 19 * Copyright (c) Russ Anderson <rja@sgi.com>
20 */ 20 */
21 21
22#include <linux/efi.h> 22#include <linux/efi.h>
@@ -30,6 +30,7 @@ static struct uv_systab uv_systab;
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) 30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{ 31{
32 struct uv_systab *tab = &uv_systab; 32 struct uv_systab *tab = &uv_systab;
33 s64 ret;
33 34
34 if (!tab->function) 35 if (!tab->function)
35 /* 36 /*
@@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
37 */ 38 */
38 return BIOS_STATUS_UNIMPLEMENTED; 39 return BIOS_STATUS_UNIMPLEMENTED;
39 40
40 return efi_call6((void *)__va(tab->function), 41 ret = efi_call6((void *)__va(tab->function), (u64)which,
41 (u64)which, a1, a2, a3, a4, a5); 42 a1, a2, a3, a4, a5);
43 return ret;
42} 44}
45EXPORT_SYMBOL_GPL(uv_bios_call);
43 46
44s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, 47s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
45 u64 a4, u64 a5) 48 u64 a4, u64 a5)
@@ -73,11 +76,14 @@ long sn_coherency_id;
73EXPORT_SYMBOL_GPL(sn_coherency_id); 76EXPORT_SYMBOL_GPL(sn_coherency_id);
74long sn_region_size; 77long sn_region_size;
75EXPORT_SYMBOL_GPL(sn_region_size); 78EXPORT_SYMBOL_GPL(sn_region_size);
79long system_serial_number;
80EXPORT_SYMBOL_GPL(system_serial_number);
76int uv_type; 81int uv_type;
82EXPORT_SYMBOL_GPL(uv_type);
77 83
78 84
79s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, 85s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
80 long *region) 86 long *region, long *ssn)
81{ 87{
82 s64 ret; 88 s64 ret;
83 u64 v0, v1; 89 u64 v0, v1;
@@ -97,8 +103,11 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
97 *coher = part.coherence_id; 103 *coher = part.coherence_id;
98 if (region) 104 if (region)
99 *region = part.region_size; 105 *region = part.region_size;
106 if (ssn)
107 *ssn = v1;
100 return ret; 108 return ret;
101} 109}
110EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
102 111
103int 112int
104uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, 113uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
@@ -154,6 +163,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
154} 163}
155EXPORT_SYMBOL_GPL(uv_bios_freq_base); 164EXPORT_SYMBOL_GPL(uv_bios_freq_base);
156 165
166/*
167 * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
168 * @decode: true to enable target, false to disable target
169 * @domain: PCI domain number
170 * @bus: PCI bus number
171 *
172 * Returns:
173 * 0: Success
174 * -EINVAL: Invalid domain or bus number
175 * -ENOSYS: Capability not available
176 * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
177 */
178int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
179{
180 return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
181 (u64)decode, (u64)domain, (u64)bus, 0, 0);
182}
183EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
184
157 185
158#ifdef CONFIG_EFI 186#ifdef CONFIG_EFI
159void uv_bios_init(void) 187void uv_bios_init(void)
@@ -185,4 +213,3 @@ void uv_bios_init(void)
185 213
186void uv_bios_init(void) { } 214void uv_bios_init(void) { }
187#endif 215#endif
188
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 468489b57aae..97ad79cdf688 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, 34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
36 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
37 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
38 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
35 { 0, 0, 0, 0 } 39 { 0, 0, 0, 0 }
36 }; 40 };
37 41
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index f138c6c389b9..870e6cc6ad28 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -10,6 +10,20 @@ if CPU_FREQ
10 10
11comment "CPUFreq processor drivers" 11comment "CPUFreq processor drivers"
12 12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
17 This driver adds support for the PCC interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
13config X86_ACPI_CPUFREQ 27config X86_ACPI_CPUFREQ
14 tristate "ACPI Processor P-States driver" 28 tristate "ACPI Processor P-States driver"
15 select CPU_FREQ_TABLE 29 select CPU_FREQ_TABLE
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 509296df294d..1840c0a5170b 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -4,6 +4,7 @@
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
7obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
8obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
9obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
new file mode 100644
index 000000000000..ff36d2979a90
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -0,0 +1,620 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33
34#include <linux/acpi.h>
35#include <linux/io.h>
36#include <linux/spinlock.h>
37#include <linux/uaccess.h>
38
39#include <acpi/processor.h>
40
41#define PCC_VERSION "1.00.00"
42#define POLL_LOOPS 300
43
44#define CMD_COMPLETE 0x1
45#define CMD_GET_FREQ 0x0
46#define CMD_SET_FREQ 0x1
47
48#define BUF_SZ 4
49
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "pcc-cpufreq", msg)
52
53struct pcc_register_resource {
54 u8 descriptor;
55 u16 length;
56 u8 space_id;
57 u8 bit_width;
58 u8 bit_offset;
59 u8 access_size;
60 u64 address;
61} __attribute__ ((packed));
62
63struct pcc_memory_resource {
64 u8 descriptor;
65 u16 length;
66 u8 space_id;
67 u8 resource_usage;
68 u8 type_specific;
69 u64 granularity;
70 u64 minimum;
71 u64 maximum;
72 u64 translation_offset;
73 u64 address_length;
74} __attribute__ ((packed));
75
76static struct cpufreq_driver pcc_cpufreq_driver;
77
78struct pcc_header {
79 u32 signature;
80 u16 length;
81 u8 major;
82 u8 minor;
83 u32 features;
84 u16 command;
85 u16 status;
86 u32 latency;
87 u32 minimum_time;
88 u32 maximum_time;
89 u32 nominal;
90 u32 throttled_frequency;
91 u32 minimum_frequency;
92};
93
94static void __iomem *pcch_virt_addr;
95static struct pcc_header __iomem *pcch_hdr;
96
97static DEFINE_SPINLOCK(pcc_lock);
98
99static struct acpi_generic_address doorbell;
100
101static u64 doorbell_preserve;
102static u64 doorbell_write;
103
104static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
105 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
106
107struct pcc_cpu {
108 u32 input_offset;
109 u32 output_offset;
110};
111
112static struct pcc_cpu *pcc_cpu_info;
113
114static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
115{
116 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
117 policy->cpuinfo.max_freq);
118 return 0;
119}
120
121static inline void pcc_cmd(void)
122{
123 u64 doorbell_value;
124 int i;
125
126 acpi_read(&doorbell_value, &doorbell);
127 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
128 &doorbell);
129
130 for (i = 0; i < POLL_LOOPS; i++) {
131 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
132 break;
133 }
134}
135
136static inline void pcc_clear_mapping(void)
137{
138 if (pcch_virt_addr)
139 iounmap(pcch_virt_addr);
140 pcch_virt_addr = NULL;
141}
142
143static unsigned int pcc_get_freq(unsigned int cpu)
144{
145 struct pcc_cpu *pcc_cpu_data;
146 unsigned int curr_freq;
147 unsigned int freq_limit;
148 u16 status;
149 u32 input_buffer;
150 u32 output_buffer;
151
152 spin_lock(&pcc_lock);
153
154 dprintk("get: get_freq for CPU %d\n", cpu);
155 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
156
157 input_buffer = 0x1;
158 iowrite32(input_buffer,
159 (pcch_virt_addr + pcc_cpu_data->input_offset));
160 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
161
162 pcc_cmd();
163
164 output_buffer =
165 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
166
167 /* Clear the input buffer - we are done with the current command */
168 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
169
170 status = ioread16(&pcch_hdr->status);
171 if (status != CMD_COMPLETE) {
172 dprintk("get: FAILED: for CPU %d, status is %d\n",
173 cpu, status);
174 goto cmd_incomplete;
175 }
176 iowrite16(0, &pcch_hdr->status);
177 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
178 / 100) * 1000);
179
180 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
181 "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
182 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
183 output_buffer, curr_freq);
184
185 freq_limit = (output_buffer >> 8) & 0xff;
186 if (freq_limit != 0xff) {
187 dprintk("get: frequency for cpu %d is being temporarily"
188 " capped at %d\n", cpu, curr_freq);
189 }
190
191 spin_unlock(&pcc_lock);
192 return curr_freq;
193
194cmd_incomplete:
195 iowrite16(0, &pcch_hdr->status);
196 spin_unlock(&pcc_lock);
197 return -EINVAL;
198}
199
200static int pcc_cpufreq_target(struct cpufreq_policy *policy,
201 unsigned int target_freq,
202 unsigned int relation)
203{
204 struct pcc_cpu *pcc_cpu_data;
205 struct cpufreq_freqs freqs;
206 u16 status;
207 u32 input_buffer;
208 int cpu;
209
210 spin_lock(&pcc_lock);
211 cpu = policy->cpu;
212 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
213
214 dprintk("target: CPU %d should go to target freq: %d "
215 "(virtual) input_offset is 0x%x\n",
216 cpu, target_freq,
217 (pcch_virt_addr + pcc_cpu_data->input_offset));
218
219 freqs.new = target_freq;
220 freqs.cpu = cpu;
221 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
222
223 input_buffer = 0x1 | (((target_freq * 100)
224 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
225 iowrite32(input_buffer,
226 (pcch_virt_addr + pcc_cpu_data->input_offset));
227 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
228
229 pcc_cmd();
230
231 /* Clear the input buffer - we are done with the current command */
232 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
233
234 status = ioread16(&pcch_hdr->status);
235 if (status != CMD_COMPLETE) {
236 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
237 cpu, status);
238 goto cmd_incomplete;
239 }
240 iowrite16(0, &pcch_hdr->status);
241
242 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
243 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
244 spin_unlock(&pcc_lock);
245
246 return 0;
247
248cmd_incomplete:
249 iowrite16(0, &pcch_hdr->status);
250 spin_unlock(&pcc_lock);
251 return -EINVAL;
252}
253
254static int pcc_get_offset(int cpu)
255{
256 acpi_status status;
257 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
258 union acpi_object *pccp, *offset;
259 struct pcc_cpu *pcc_cpu_data;
260 struct acpi_processor *pr;
261 int ret = 0;
262
263 pr = per_cpu(processors, cpu);
264 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
265
266 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
267 if (ACPI_FAILURE(status))
268 return -ENODEV;
269
270 pccp = buffer.pointer;
271 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
272 ret = -ENODEV;
273 goto out_free;
274 };
275
276 offset = &(pccp->package.elements[0]);
277 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
278 ret = -ENODEV;
279 goto out_free;
280 }
281
282 pcc_cpu_data->input_offset = offset->integer.value;
283
284 offset = &(pccp->package.elements[1]);
285 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
286 ret = -ENODEV;
287 goto out_free;
288 }
289
290 pcc_cpu_data->output_offset = offset->integer.value;
291
292 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
293 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
294
295 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
296 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
297 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
298out_free:
299 kfree(buffer.pointer);
300 return ret;
301}
302
303static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
304{
305 acpi_status status;
306 struct acpi_object_list input;
307 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
308 union acpi_object in_params[4];
309 union acpi_object *out_obj;
310 u32 capabilities[2];
311 u32 errors;
312 u32 supported;
313 int ret = 0;
314
315 input.count = 4;
316 input.pointer = in_params;
317 input.count = 4;
318 input.pointer = in_params;
319 in_params[0].type = ACPI_TYPE_BUFFER;
320 in_params[0].buffer.length = 16;
321 in_params[0].buffer.pointer = OSC_UUID;
322 in_params[1].type = ACPI_TYPE_INTEGER;
323 in_params[1].integer.value = 1;
324 in_params[2].type = ACPI_TYPE_INTEGER;
325 in_params[2].integer.value = 2;
326 in_params[3].type = ACPI_TYPE_BUFFER;
327 in_params[3].buffer.length = 8;
328 in_params[3].buffer.pointer = (u8 *)&capabilities;
329
330 capabilities[0] = OSC_QUERY_ENABLE;
331 capabilities[1] = 0x1;
332
333 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
334 if (ACPI_FAILURE(status))
335 return -ENODEV;
336
337 if (!output.length)
338 return -ENODEV;
339
340 out_obj = output.pointer;
341 if (out_obj->type != ACPI_TYPE_BUFFER) {
342 ret = -ENODEV;
343 goto out_free;
344 }
345
346 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
347 if (errors) {
348 ret = -ENODEV;
349 goto out_free;
350 }
351
352 supported = *((u32 *)(out_obj->buffer.pointer + 4));
353 if (!(supported & 0x1)) {
354 ret = -ENODEV;
355 goto out_free;
356 }
357
358 kfree(output.pointer);
359 capabilities[0] = 0x0;
360 capabilities[1] = 0x1;
361
362 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
363 if (ACPI_FAILURE(status))
364 return -ENODEV;
365
366 if (!output.length)
367 return -ENODEV;
368
369 out_obj = output.pointer;
370 if (out_obj->type != ACPI_TYPE_BUFFER) {
371 ret = -ENODEV;
372 goto out_free;
373 }
374
375 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
376 if (errors) {
377 ret = -ENODEV;
378 goto out_free;
379 }
380
381 supported = *((u32 *)(out_obj->buffer.pointer + 4));
382 if (!(supported & 0x1)) {
383 ret = -ENODEV;
384 goto out_free;
385 }
386
387out_free:
388 kfree(output.pointer);
389 return ret;
390}
391
392static int __init pcc_cpufreq_probe(void)
393{
394 acpi_status status;
395 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
396 struct pcc_memory_resource *mem_resource;
397 struct pcc_register_resource *reg_resource;
398 union acpi_object *out_obj, *member;
399 acpi_handle handle, osc_handle;
400 int ret = 0;
401
402 status = acpi_get_handle(NULL, "\\_SB", &handle);
403 if (ACPI_FAILURE(status))
404 return -ENODEV;
405
406 status = acpi_get_handle(handle, "_OSC", &osc_handle);
407 if (ACPI_SUCCESS(status)) {
408 ret = pcc_cpufreq_do_osc(&osc_handle);
409 if (ret)
410 dprintk("probe: _OSC evaluation did not succeed\n");
411 /* Firmware's use of _OSC is optional */
412 ret = 0;
413 }
414
415 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
416 if (ACPI_FAILURE(status))
417 return -ENODEV;
418
419 out_obj = output.pointer;
420 if (out_obj->type != ACPI_TYPE_PACKAGE) {
421 ret = -ENODEV;
422 goto out_free;
423 }
424
425 member = &out_obj->package.elements[0];
426 if (member->type != ACPI_TYPE_BUFFER) {
427 ret = -ENODEV;
428 goto out_free;
429 }
430
431 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
432
433 dprintk("probe: mem_resource descriptor: 0x%x,"
434 " length: %d, space_id: %d, resource_usage: %d,"
435 " type_specific: %d, granularity: 0x%llx,"
436 " minimum: 0x%llx, maximum: 0x%llx,"
437 " translation_offset: 0x%llx, address_length: 0x%llx\n",
438 mem_resource->descriptor, mem_resource->length,
439 mem_resource->space_id, mem_resource->resource_usage,
440 mem_resource->type_specific, mem_resource->granularity,
441 mem_resource->minimum, mem_resource->maximum,
442 mem_resource->translation_offset,
443 mem_resource->address_length);
444
445 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
446 ret = -ENODEV;
447 goto out_free;
448 }
449
450 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
451 mem_resource->address_length);
452 if (pcch_virt_addr == NULL) {
453 dprintk("probe: could not map shared mem region\n");
454 goto out_free;
455 }
456 pcch_hdr = pcch_virt_addr;
457
458 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
459 dprintk("probe: PCCH header is at physical address: 0x%llx,"
460 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
461 " supported features: 0x%x, command field: 0x%x,"
462 " status field: 0x%x, nominal latency: %d us\n",
463 mem_resource->minimum, ioread32(&pcch_hdr->signature),
464 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
465 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
466 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
467 ioread32(&pcch_hdr->latency));
468
469 dprintk("probe: min time between commands: %d us,"
470 " max time between commands: %d us,"
471 " nominal CPU frequency: %d MHz,"
472 " minimum CPU frequency: %d MHz,"
473 " minimum CPU frequency without throttling: %d MHz\n",
474 ioread32(&pcch_hdr->minimum_time),
475 ioread32(&pcch_hdr->maximum_time),
476 ioread32(&pcch_hdr->nominal),
477 ioread32(&pcch_hdr->throttled_frequency),
478 ioread32(&pcch_hdr->minimum_frequency));
479
480 member = &out_obj->package.elements[1];
481 if (member->type != ACPI_TYPE_BUFFER) {
482 ret = -ENODEV;
483 goto pcch_free;
484 }
485
486 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
487
488 doorbell.space_id = reg_resource->space_id;
489 doorbell.bit_width = reg_resource->bit_width;
490 doorbell.bit_offset = reg_resource->bit_offset;
491 doorbell.access_width = 64;
492 doorbell.address = reg_resource->address;
493
494 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
495 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
496 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
497 doorbell.access_width, reg_resource->address);
498
499 member = &out_obj->package.elements[2];
500 if (member->type != ACPI_TYPE_INTEGER) {
501 ret = -ENODEV;
502 goto pcch_free;
503 }
504
505 doorbell_preserve = member->integer.value;
506
507 member = &out_obj->package.elements[3];
508 if (member->type != ACPI_TYPE_INTEGER) {
509 ret = -ENODEV;
510 goto pcch_free;
511 }
512
513 doorbell_write = member->integer.value;
514
515 dprintk("probe: doorbell_preserve: 0x%llx,"
516 " doorbell_write: 0x%llx\n",
517 doorbell_preserve, doorbell_write);
518
519 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
520 if (!pcc_cpu_info) {
521 ret = -ENOMEM;
522 goto pcch_free;
523 }
524
525 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
526 " limits: %d MHz, %d MHz\n", PCC_VERSION,
527 ioread32(&pcch_hdr->minimum_frequency),
528 ioread32(&pcch_hdr->nominal));
529 kfree(output.pointer);
530 return ret;
531pcch_free:
532 pcc_clear_mapping();
533out_free:
534 kfree(output.pointer);
535 return ret;
536}
537
538static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
539{
540 unsigned int cpu = policy->cpu;
541 unsigned int result = 0;
542
543 if (!pcch_virt_addr) {
544 result = -1;
545 goto pcch_null;
546 }
547
548 result = pcc_get_offset(cpu);
549 if (result) {
550 dprintk("init: PCCP evaluation failed\n");
551 goto free;
552 }
553
554 policy->max = policy->cpuinfo.max_freq =
555 ioread32(&pcch_hdr->nominal) * 1000;
556 policy->min = policy->cpuinfo.min_freq =
557 ioread32(&pcch_hdr->minimum_frequency) * 1000;
558 policy->cur = pcc_get_freq(cpu);
559
560 dprintk("init: policy->max is %d, policy->min is %d\n",
561 policy->max, policy->min);
562
563 return 0;
564free:
565 pcc_clear_mapping();
566 free_percpu(pcc_cpu_info);
567pcch_null:
568 return result;
569}
570
571static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
572{
573 return 0;
574}
575
576static struct cpufreq_driver pcc_cpufreq_driver = {
577 .flags = CPUFREQ_CONST_LOOPS,
578 .get = pcc_get_freq,
579 .verify = pcc_cpufreq_verify,
580 .target = pcc_cpufreq_target,
581 .init = pcc_cpufreq_cpu_init,
582 .exit = pcc_cpufreq_cpu_exit,
583 .name = "pcc-cpufreq",
584 .owner = THIS_MODULE,
585};
586
587static int __init pcc_cpufreq_init(void)
588{
589 int ret;
590
591 if (acpi_disabled)
592 return 0;
593
594 ret = pcc_cpufreq_probe();
595 if (ret) {
596 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
597 return ret;
598 }
599
600 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
601
602 return ret;
603}
604
605static void __exit pcc_cpufreq_exit(void)
606{
607 cpufreq_unregister_driver(&pcc_cpufreq_driver);
608
609 pcc_clear_mapping();
610
611 free_percpu(pcc_cpu_info);
612}
613
614MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
615MODULE_VERSION(PCC_VERSION);
616MODULE_DESCRIPTION("Processor Clocking Control interface driver");
617MODULE_LICENSE("GPL");
618
619late_initcall(pcc_cpufreq_init);
620module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index f125e5c551c0..d360b56e9825 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data)
806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, 806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
807 unsigned int index) 807 unsigned int index)
808{ 808{
809 acpi_integer control; 809 u64 control;
810 810
811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
812 return; 812 return;
@@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
824{ 824{
825 struct cpufreq_frequency_table *powernow_table; 825 struct cpufreq_frequency_table *powernow_table;
826 int ret_val = -ENODEV; 826 int ret_val = -ENODEV;
827 acpi_integer control, status; 827 u64 control, status;
828 828
829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
830 dprintk("register performance failed: bad ACPI data\n"); 830 dprintk("register performance failed: bad ACPI data\n");
@@ -948,7 +948,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
948 u32 fid; 948 u32 fid;
949 u32 vid; 949 u32 vid;
950 u32 freq, index; 950 u32 freq, index;
951 acpi_integer status, control; 951 u64 status, control;
952 952
953 if (data->exttype) { 953 if (data->exttype) {
954 status = data->acpi_data.states[i].status; 954 status = data->acpi_data.states[i].status;
@@ -1356,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1356 1356
1357 kfree(data->powernow_table); 1357 kfree(data->powernow_table);
1358 kfree(data); 1358 kfree(data);
1359 per_cpu(powernow_data, pol->cpu) = NULL;
1359 1360
1360 return 0; 1361 return 0;
1361} 1362}
@@ -1375,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu)
1375 int err; 1376 int err;
1376 1377
1377 if (!data) 1378 if (!data)
1378 return -EINVAL; 1379 return 0;
1379 1380
1380 smp_call_function_single(cpu, query_values_on_cpu, &err, true); 1381 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1381 if (err) 1382 if (err)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index fc6c8ef92dcc..b3eeb66c0a51 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -18,6 +18,7 @@
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/k8.h>
21#include <asm/smp.h>
21 22
22#define LVL_1_INST 1 23#define LVL_1_INST 1
23#define LVL_1_DATA 2 24#define LVL_1_DATA 2
@@ -31,6 +32,8 @@ struct _cache_table {
31 short size; 32 short size;
32}; 33};
33 34
35#define MB(x) ((x) * 1024)
36
34/* All the cache descriptor types we care about (no TLB or 37/* All the cache descriptor types we care about (no TLB or
35 trace cache entries) */ 38 trace cache entries) */
36 39
@@ -44,9 +47,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
44 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ 47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
45 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ 48 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
46 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 49 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
47 { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 50 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
48 { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 51 { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
49 { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 52 { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
50 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ 53 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
51 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ 54 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
52 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 55 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -59,16 +62,16 @@ static const struct _cache_table __cpuinitconst cache_table[] =
59 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ 62 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
60 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ 63 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
61 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ 64 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
62 { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */ 65 { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
63 { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */ 66 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
64 { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */ 67 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
65 { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */ 68 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
66 { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 69 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
67 { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */ 70 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
68 { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 71 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
69 { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ 72 { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
70 { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ 73 { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
71 { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */ 74 { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
72 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 75 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
73 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 76 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
74 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 77 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -77,34 +80,34 @@ static const struct _cache_table __cpuinitconst cache_table[] =
77 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ 80 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
78 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ 81 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
79 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ 82 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
80 { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */ 83 { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
81 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 84 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
82 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 85 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
83 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 86 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
84 { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 87 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
85 { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */ 88 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
86 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ 89 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
87 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ 90 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
88 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ 91 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
89 { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */ 92 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
90 { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */ 93 { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
91 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ 94 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
92 { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */ 95 { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
93 { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ 96 { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ 97 { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ 98 { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ 99 { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
97 { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */ 100 { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 101 { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ 102 { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 103 { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
101 { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */ 104 { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ 105 { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 106 { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 107 { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
105 { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */ 108 { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
106 { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */ 109 { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
107 { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */ 110 { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
108 { 0x00, 0, 0} 111 { 0x00, 0, 0}
109}; 112};
110 113
@@ -150,7 +153,8 @@ struct _cpuid4_info {
150 union _cpuid4_leaf_ebx ebx; 153 union _cpuid4_leaf_ebx ebx;
151 union _cpuid4_leaf_ecx ecx; 154 union _cpuid4_leaf_ecx ecx;
152 unsigned long size; 155 unsigned long size;
153 unsigned long can_disable; 156 bool can_disable;
157 unsigned int l3_indices;
154 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 158 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
155}; 159};
156 160
@@ -160,7 +164,8 @@ struct _cpuid4_info_regs {
160 union _cpuid4_leaf_ebx ebx; 164 union _cpuid4_leaf_ebx ebx;
161 union _cpuid4_leaf_ecx ecx; 165 union _cpuid4_leaf_ecx ecx;
162 unsigned long size; 166 unsigned long size;
163 unsigned long can_disable; 167 bool can_disable;
168 unsigned int l3_indices;
164}; 169};
165 170
166unsigned short num_cache_leaves; 171unsigned short num_cache_leaves;
@@ -290,6 +295,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
290 (ebx->split.ways_of_associativity + 1) - 1; 295 (ebx->split.ways_of_associativity + 1) - 1;
291} 296}
292 297
298struct _cache_attr {
299 struct attribute attr;
300 ssize_t (*show)(struct _cpuid4_info *, char *);
301 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
302};
303
304#ifdef CONFIG_CPU_SUP_AMD
305static unsigned int __cpuinit amd_calc_l3_indices(void)
306{
307 /*
308 * We're called over smp_call_function_single() and therefore
309 * are on the correct cpu.
310 */
311 int cpu = smp_processor_id();
312 int node = cpu_to_node(cpu);
313 struct pci_dev *dev = node_to_k8_nb_misc(node);
314 unsigned int sc0, sc1, sc2, sc3;
315 u32 val = 0;
316
317 pci_read_config_dword(dev, 0x1C4, &val);
318
319 /* calculate subcache sizes */
320 sc0 = !(val & BIT(0));
321 sc1 = !(val & BIT(4));
322 sc2 = !(val & BIT(8)) + !(val & BIT(9));
323 sc3 = !(val & BIT(12)) + !(val & BIT(13));
324
325 return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
326}
327
293static void __cpuinit 328static void __cpuinit
294amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 329amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
295{ 330{
@@ -299,12 +334,103 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
299 if (boot_cpu_data.x86 == 0x11) 334 if (boot_cpu_data.x86 == 0x11)
300 return; 335 return;
301 336
302 /* see erratum #382 */ 337 /* see errata #382 and #388 */
303 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) 338 if ((boot_cpu_data.x86 == 0x10) &&
339 ((boot_cpu_data.x86_model < 0x8) ||
340 (boot_cpu_data.x86_mask < 0x1)))
304 return; 341 return;
305 342
306 this_leaf->can_disable = 1; 343 this_leaf->can_disable = true;
344 this_leaf->l3_indices = amd_calc_l3_indices();
345}
346
347static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
348 unsigned int index)
349{
350 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
351 int node = amd_get_nb_id(cpu);
352 struct pci_dev *dev = node_to_k8_nb_misc(node);
353 unsigned int reg = 0;
354
355 if (!this_leaf->can_disable)
356 return -EINVAL;
357
358 if (!dev)
359 return -EINVAL;
360
361 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
362 return sprintf(buf, "0x%08x\n", reg);
363}
364
365#define SHOW_CACHE_DISABLE(index) \
366static ssize_t \
367show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
368{ \
369 return show_cache_disable(this_leaf, buf, index); \
307} 370}
371SHOW_CACHE_DISABLE(0)
372SHOW_CACHE_DISABLE(1)
373
374static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
375 const char *buf, size_t count, unsigned int index)
376{
377 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
378 int node = amd_get_nb_id(cpu);
379 struct pci_dev *dev = node_to_k8_nb_misc(node);
380 unsigned long val = 0;
381
382#define SUBCACHE_MASK (3UL << 20)
383#define SUBCACHE_INDEX 0xfff
384
385 if (!this_leaf->can_disable)
386 return -EINVAL;
387
388 if (!capable(CAP_SYS_ADMIN))
389 return -EPERM;
390
391 if (!dev)
392 return -EINVAL;
393
394 if (strict_strtoul(buf, 10, &val) < 0)
395 return -EINVAL;
396
397 /* do not allow writes outside of allowed bits */
398 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
399 ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
400 return -EINVAL;
401
402 val |= BIT(30);
403 pci_write_config_dword(dev, 0x1BC + index * 4, val);
404 /*
405 * We need to WBINVD on a core on the node containing the L3 cache which
406 * indices we disable therefore a simple wbinvd() is not sufficient.
407 */
408 wbinvd_on_cpu(cpu);
409 pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
410 return count;
411}
412
413#define STORE_CACHE_DISABLE(index) \
414static ssize_t \
415store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
416 const char *buf, size_t count) \
417{ \
418 return store_cache_disable(this_leaf, buf, count, index); \
419}
420STORE_CACHE_DISABLE(0)
421STORE_CACHE_DISABLE(1)
422
423static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
424 show_cache_disable_0, store_cache_disable_0);
425static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
426 show_cache_disable_1, store_cache_disable_1);
427
428#else /* CONFIG_CPU_SUP_AMD */
429static void __cpuinit
430amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
431{
432};
433#endif /* CONFIG_CPU_SUP_AMD */
308 434
309static int 435static int
310__cpuinit cpuid4_cache_lookup_regs(int index, 436__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -711,82 +837,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
711#define to_object(k) container_of(k, struct _index_kobject, kobj) 837#define to_object(k) container_of(k, struct _index_kobject, kobj)
712#define to_attr(a) container_of(a, struct _cache_attr, attr) 838#define to_attr(a) container_of(a, struct _cache_attr, attr)
713 839
714static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
715 unsigned int index)
716{
717 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
718 int node = cpu_to_node(cpu);
719 struct pci_dev *dev = node_to_k8_nb_misc(node);
720 unsigned int reg = 0;
721
722 if (!this_leaf->can_disable)
723 return -EINVAL;
724
725 if (!dev)
726 return -EINVAL;
727
728 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
729 return sprintf(buf, "%x\n", reg);
730}
731
732#define SHOW_CACHE_DISABLE(index) \
733static ssize_t \
734show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
735{ \
736 return show_cache_disable(this_leaf, buf, index); \
737}
738SHOW_CACHE_DISABLE(0)
739SHOW_CACHE_DISABLE(1)
740
741static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
742 const char *buf, size_t count, unsigned int index)
743{
744 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
745 int node = cpu_to_node(cpu);
746 struct pci_dev *dev = node_to_k8_nb_misc(node);
747 unsigned long val = 0;
748 unsigned int scrubber = 0;
749
750 if (!this_leaf->can_disable)
751 return -EINVAL;
752
753 if (!capable(CAP_SYS_ADMIN))
754 return -EPERM;
755
756 if (!dev)
757 return -EINVAL;
758
759 if (strict_strtoul(buf, 10, &val) < 0)
760 return -EINVAL;
761
762 val |= 0xc0000000;
763
764 pci_read_config_dword(dev, 0x58, &scrubber);
765 scrubber &= ~0x1f000000;
766 pci_write_config_dword(dev, 0x58, scrubber);
767
768 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
769 wbinvd();
770 pci_write_config_dword(dev, 0x1BC + index * 4, val);
771 return count;
772}
773
774#define STORE_CACHE_DISABLE(index) \
775static ssize_t \
776store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
777 const char *buf, size_t count) \
778{ \
779 return store_cache_disable(this_leaf, buf, count, index); \
780}
781STORE_CACHE_DISABLE(0)
782STORE_CACHE_DISABLE(1)
783
784struct _cache_attr {
785 struct attribute attr;
786 ssize_t (*show)(struct _cpuid4_info *, char *);
787 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
788};
789
790#define define_one_ro(_name) \ 840#define define_one_ro(_name) \
791static struct _cache_attr _name = \ 841static struct _cache_attr _name = \
792 __ATTR(_name, 0444, show_##_name, NULL) 842 __ATTR(_name, 0444, show_##_name, NULL)
@@ -801,23 +851,28 @@ define_one_ro(size);
801define_one_ro(shared_cpu_map); 851define_one_ro(shared_cpu_map);
802define_one_ro(shared_cpu_list); 852define_one_ro(shared_cpu_list);
803 853
804static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, 854#define DEFAULT_SYSFS_CACHE_ATTRS \
805 show_cache_disable_0, store_cache_disable_0); 855 &type.attr, \
806static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 856 &level.attr, \
807 show_cache_disable_1, store_cache_disable_1); 857 &coherency_line_size.attr, \
858 &physical_line_partition.attr, \
859 &ways_of_associativity.attr, \
860 &number_of_sets.attr, \
861 &size.attr, \
862 &shared_cpu_map.attr, \
863 &shared_cpu_list.attr
808 864
809static struct attribute *default_attrs[] = { 865static struct attribute *default_attrs[] = {
810 &type.attr, 866 DEFAULT_SYSFS_CACHE_ATTRS,
811 &level.attr, 867 NULL
812 &coherency_line_size.attr, 868};
813 &physical_line_partition.attr, 869
814 &ways_of_associativity.attr, 870static struct attribute *default_l3_attrs[] = {
815 &number_of_sets.attr, 871 DEFAULT_SYSFS_CACHE_ATTRS,
816 &size.attr, 872#ifdef CONFIG_CPU_SUP_AMD
817 &shared_cpu_map.attr,
818 &shared_cpu_list.attr,
819 &cache_disable_0.attr, 873 &cache_disable_0.attr,
820 &cache_disable_1.attr, 874 &cache_disable_1.attr,
875#endif
821 NULL 876 NULL
822}; 877};
823 878
@@ -848,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
848 return ret; 903 return ret;
849} 904}
850 905
851static struct sysfs_ops sysfs_ops = { 906static const struct sysfs_ops sysfs_ops = {
852 .show = show, 907 .show = show,
853 .store = store, 908 .store = store,
854}; 909};
@@ -908,6 +963,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
908 unsigned int cpu = sys_dev->id; 963 unsigned int cpu = sys_dev->id;
909 unsigned long i, j; 964 unsigned long i, j;
910 struct _index_kobject *this_object; 965 struct _index_kobject *this_object;
966 struct _cpuid4_info *this_leaf;
911 int retval; 967 int retval;
912 968
913 retval = cpuid4_cache_sysfs_init(cpu); 969 retval = cpuid4_cache_sysfs_init(cpu);
@@ -926,6 +982,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
926 this_object = INDEX_KOBJECT_PTR(cpu, i); 982 this_object = INDEX_KOBJECT_PTR(cpu, i);
927 this_object->cpu = cpu; 983 this_object->cpu = cpu;
928 this_object->index = i; 984 this_object->index = i;
985
986 this_leaf = CPUID4_INFO_IDX(cpu, i);
987
988 if (this_leaf->can_disable)
989 ktype_cache.default_attrs = default_l3_attrs;
990 else
991 ktype_cache.default_attrs = default_attrs;
992
929 retval = kobject_init_and_add(&(this_object->kobj), 993 retval = kobject_init_and_add(&(this_object->kobj),
930 &ktype_cache, 994 &ktype_cache,
931 per_cpu(ici_cache_kobject, cpu), 995 per_cpu(ici_cache_kobject, cpu),
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a8aacd4b513c..bd58de4d7a29 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -46,6 +46,11 @@
46 46
47#include "mce-internal.h" 47#include "mce-internal.h"
48 48
49#define rcu_dereference_check_mce(p) \
50 rcu_dereference_check((p), \
51 rcu_read_lock_sched_held() || \
52 lockdep_is_held(&mce_read_mutex))
53
49#define CREATE_TRACE_POINTS 54#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h> 55#include <trace/events/mce.h>
51 56
@@ -158,7 +163,7 @@ void mce_log(struct mce *mce)
158 mce->finished = 0; 163 mce->finished = 0;
159 wmb(); 164 wmb();
160 for (;;) { 165 for (;;) {
161 entry = rcu_dereference(mcelog.next); 166 entry = rcu_dereference_check_mce(mcelog.next);
162 for (;;) { 167 for (;;) {
163 /* 168 /*
164 * When the buffer fills up discard new entries. 169 * When the buffer fills up discard new entries.
@@ -1500,7 +1505,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1500 return -ENOMEM; 1505 return -ENOMEM;
1501 1506
1502 mutex_lock(&mce_read_mutex); 1507 mutex_lock(&mce_read_mutex);
1503 next = rcu_dereference(mcelog.next); 1508 next = rcu_dereference_check_mce(mcelog.next);
1504 1509
1505 /* Only supports full reads right now */ 1510 /* Only supports full reads right now */
1506 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1511 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
@@ -1565,7 +1570,7 @@ timeout:
1565static unsigned int mce_poll(struct file *file, poll_table *wait) 1570static unsigned int mce_poll(struct file *file, poll_table *wait)
1566{ 1571{
1567 poll_wait(file, &mce_wait, wait); 1572 poll_wait(file, &mce_wait, wait);
1568 if (rcu_dereference(mcelog.next)) 1573 if (rcu_dereference_check_mce(mcelog.next))
1569 return POLLIN | POLLRDNORM; 1574 return POLLIN | POLLRDNORM;
1570 return 0; 1575 return 0;
1571} 1576}
@@ -2044,6 +2049,7 @@ static __init void mce_init_banks(void)
2044 struct mce_bank *b = &mce_banks[i]; 2049 struct mce_bank *b = &mce_banks[i];
2045 struct sysdev_attribute *a = &b->attr; 2050 struct sysdev_attribute *a = &b->attr;
2046 2051
2052 sysfs_attr_init(&a->attr);
2047 a->attr.name = b->attrname; 2053 a->attr.name = b->attrname;
2048 snprintf(b->attrname, ATTR_LEN, "bank%d", i); 2054 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2049 2055
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 83a3d1f4efca..cda932ca3ade 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -388,7 +388,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
388 return ret; 388 return ret;
389} 389}
390 390
391static struct sysfs_ops threshold_ops = { 391static const struct sysfs_ops threshold_ops = {
392 .show = show, 392 .show = show,
393 .store = store, 393 .store = store,
394}; 394};
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b56f8e9..ad9e5ed81181 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o cleanup.o 1obj-y := main.o if.o generic.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 33af14110dfd..92ba9cd31c9a 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
108 return 0; 108 return 0;
109} 109}
110 110
111static struct mtrr_ops amd_mtrr_ops = { 111static const struct mtrr_ops amd_mtrr_ops = {
112 .vendor = X86_VENDOR_AMD, 112 .vendor = X86_VENDOR_AMD,
113 .set = amd_set_mtrr, 113 .set = amd_set_mtrr,
114 .get = amd_get_mtrr, 114 .get = amd_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index de89f14eff3a..316fe3e60a97 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
110 return 0; 110 return 0;
111} 111}
112 112
113static struct mtrr_ops centaur_mtrr_ops = { 113static const struct mtrr_ops centaur_mtrr_ops = {
114 .vendor = X86_VENDOR_CENTAUR, 114 .vendor = X86_VENDOR_CENTAUR,
115 .set = centaur_set_mcr, 115 .set = centaur_set_mcr,
116 .get = centaur_get_mcr, 116 .get = centaur_get_mcr,
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 09b1698e0466..06130b52f012 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
22#include <linux/pci.h> 22#include <linux/pci.h>
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/sort.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
28#include <linux/kvm_para.h> 27#include <linux/kvm_para.h>
28#include <linux/range.h>
29 29
30#include <asm/processor.h> 30#include <asm/processor.h>
31#include <asm/e820.h> 31#include <asm/e820.h>
@@ -34,11 +34,6 @@
34 34
35#include "mtrr.h" 35#include "mtrr.h"
36 36
37struct res_range {
38 unsigned long start;
39 unsigned long end;
40};
41
42struct var_mtrr_range_state { 37struct var_mtrr_range_state {
43 unsigned long base_pfn; 38 unsigned long base_pfn;
44 unsigned long size_pfn; 39 unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
56/* Should be related to MTRR_VAR_RANGES nums */ 51/* Should be related to MTRR_VAR_RANGES nums */
57#define RANGE_NUM 256 52#define RANGE_NUM 256
58 53
59static struct res_range __initdata range[RANGE_NUM]; 54static struct range __initdata range[RANGE_NUM];
60static int __initdata nr_range; 55static int __initdata nr_range;
61 56
62static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 57static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
64static int __initdata debug_print; 59static int __initdata debug_print;
65#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) 60#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
66 61
67
68static int __init
69add_range(struct res_range *range, int nr_range,
70 unsigned long start, unsigned long end)
71{
72 /* Out of slots: */
73 if (nr_range >= RANGE_NUM)
74 return nr_range;
75
76 range[nr_range].start = start;
77 range[nr_range].end = end;
78
79 nr_range++;
80
81 return nr_range;
82}
83
84static int __init
85add_range_with_merge(struct res_range *range, int nr_range,
86 unsigned long start, unsigned long end)
87{
88 int i;
89
90 /* Try to merge it with old one: */
91 for (i = 0; i < nr_range; i++) {
92 unsigned long final_start, final_end;
93 unsigned long common_start, common_end;
94
95 if (!range[i].end)
96 continue;
97
98 common_start = max(range[i].start, start);
99 common_end = min(range[i].end, end);
100 if (common_start > common_end + 1)
101 continue;
102
103 final_start = min(range[i].start, start);
104 final_end = max(range[i].end, end);
105
106 range[i].start = final_start;
107 range[i].end = final_end;
108 return nr_range;
109 }
110
111 /* Need to add it: */
112 return add_range(range, nr_range, start, end);
113}
114
115static void __init
116subtract_range(struct res_range *range, unsigned long start, unsigned long end)
117{
118 int i, j;
119
120 for (j = 0; j < RANGE_NUM; j++) {
121 if (!range[j].end)
122 continue;
123
124 if (start <= range[j].start && end >= range[j].end) {
125 range[j].start = 0;
126 range[j].end = 0;
127 continue;
128 }
129
130 if (start <= range[j].start && end < range[j].end &&
131 range[j].start < end + 1) {
132 range[j].start = end + 1;
133 continue;
134 }
135
136
137 if (start > range[j].start && end >= range[j].end &&
138 range[j].end > start - 1) {
139 range[j].end = start - 1;
140 continue;
141 }
142
143 if (start > range[j].start && end < range[j].end) {
144 /* Find the new spare: */
145 for (i = 0; i < RANGE_NUM; i++) {
146 if (range[i].end == 0)
147 break;
148 }
149 if (i < RANGE_NUM) {
150 range[i].end = range[j].end;
151 range[i].start = end + 1;
152 } else {
153 printk(KERN_ERR "run of slot in ranges\n");
154 }
155 range[j].end = start - 1;
156 continue;
157 }
158 }
159}
160
161static int __init cmp_range(const void *x1, const void *x2)
162{
163 const struct res_range *r1 = x1;
164 const struct res_range *r2 = x2;
165 long start1, start2;
166
167 start1 = r1->start;
168 start2 = r2->start;
169
170 return start1 - start2;
171}
172
173static int __init clean_sort_range(struct res_range *range, int az)
174{
175 int i, j, k = az - 1, nr_range = 0;
176
177 for (i = 0; i < k; i++) {
178 if (range[i].end)
179 continue;
180 for (j = k; j > i; j--) {
181 if (range[j].end) {
182 k = j;
183 break;
184 }
185 }
186 if (j == i)
187 break;
188 range[i].start = range[k].start;
189 range[i].end = range[k].end;
190 range[k].start = 0;
191 range[k].end = 0;
192 k--;
193 }
194 /* count it */
195 for (i = 0; i < az; i++) {
196 if (!range[i].end) {
197 nr_range = i;
198 break;
199 }
200 }
201
202 /* sort them */
203 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
204
205 return nr_range;
206}
207
208#define BIOS_BUG_MSG KERN_WARNING \ 62#define BIOS_BUG_MSG KERN_WARNING \
209 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 63 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
210 64
211static int __init 65static int __init
212x86_get_mtrr_mem_range(struct res_range *range, int nr_range, 66x86_get_mtrr_mem_range(struct range *range, int nr_range,
213 unsigned long extra_remove_base, 67 unsigned long extra_remove_base,
214 unsigned long extra_remove_size) 68 unsigned long extra_remove_size)
215{ 69{
@@ -223,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
223 continue; 77 continue;
224 base = range_state[i].base_pfn; 78 base = range_state[i].base_pfn;
225 size = range_state[i].size_pfn; 79 size = range_state[i].size_pfn;
226 nr_range = add_range_with_merge(range, nr_range, base, 80 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
227 base + size - 1); 81 base, base + size);
228 } 82 }
229 if (debug_print) { 83 if (debug_print) {
230 printk(KERN_DEBUG "After WB checking\n"); 84 printk(KERN_DEBUG "After WB checking\n");
231 for (i = 0; i < nr_range; i++) 85 for (i = 0; i < nr_range; i++)
232 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 86 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
233 range[i].start, range[i].end + 1); 87 range[i].start, range[i].end);
234 } 88 }
235 89
236 /* Take out UC ranges: */ 90 /* Take out UC ranges: */
@@ -252,19 +106,19 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
252 size -= (1<<(20-PAGE_SHIFT)) - base; 106 size -= (1<<(20-PAGE_SHIFT)) - base;
253 base = 1<<(20-PAGE_SHIFT); 107 base = 1<<(20-PAGE_SHIFT);
254 } 108 }
255 subtract_range(range, base, base + size - 1); 109 subtract_range(range, RANGE_NUM, base, base + size);
256 } 110 }
257 if (extra_remove_size) 111 if (extra_remove_size)
258 subtract_range(range, extra_remove_base, 112 subtract_range(range, RANGE_NUM, extra_remove_base,
259 extra_remove_base + extra_remove_size - 1); 113 extra_remove_base + extra_remove_size);
260 114
261 if (debug_print) { 115 if (debug_print) {
262 printk(KERN_DEBUG "After UC checking\n"); 116 printk(KERN_DEBUG "After UC checking\n");
263 for (i = 0; i < RANGE_NUM; i++) { 117 for (i = 0; i < RANGE_NUM; i++) {
264 if (!range[i].end) 118 if (!range[i].end)
265 continue; 119 continue;
266 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 120 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
267 range[i].start, range[i].end + 1); 121 range[i].start, range[i].end);
268 } 122 }
269 } 123 }
270 124
@@ -273,26 +127,22 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
273 if (debug_print) { 127 if (debug_print) {
274 printk(KERN_DEBUG "After sorting\n"); 128 printk(KERN_DEBUG "After sorting\n");
275 for (i = 0; i < nr_range; i++) 129 for (i = 0; i < nr_range; i++)
276 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 130 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
277 range[i].start, range[i].end + 1); 131 range[i].start, range[i].end);
278 } 132 }
279 133
280 /* clear those is not used */
281 for (i = nr_range; i < RANGE_NUM; i++)
282 memset(&range[i], 0, sizeof(range[i]));
283
284 return nr_range; 134 return nr_range;
285} 135}
286 136
287#ifdef CONFIG_MTRR_SANITIZER 137#ifdef CONFIG_MTRR_SANITIZER
288 138
289static unsigned long __init sum_ranges(struct res_range *range, int nr_range) 139static unsigned long __init sum_ranges(struct range *range, int nr_range)
290{ 140{
291 unsigned long sum = 0; 141 unsigned long sum = 0;
292 int i; 142 int i;
293 143
294 for (i = 0; i < nr_range; i++) 144 for (i = 0; i < nr_range; i++)
295 sum += range[i].end + 1 - range[i].start; 145 sum += range[i].end - range[i].start;
296 146
297 return sum; 147 return sum;
298} 148}
@@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
621early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); 471early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
622 472
623static int __init 473static int __init
624x86_setup_var_mtrrs(struct res_range *range, int nr_range, 474x86_setup_var_mtrrs(struct range *range, int nr_range,
625 u64 chunk_size, u64 gran_size) 475 u64 chunk_size, u64 gran_size)
626{ 476{
627 struct var_mtrr_state var_state; 477 struct var_mtrr_state var_state;
@@ -639,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
639 /* Write the range: */ 489 /* Write the range: */
640 for (i = 0; i < nr_range; i++) { 490 for (i = 0; i < nr_range; i++) {
641 set_var_mtrr_range(&var_state, range[i].start, 491 set_var_mtrr_range(&var_state, range[i].start,
642 range[i].end - range[i].start + 1); 492 range[i].end - range[i].start);
643 } 493 }
644 494
645 /* Write the last range: */ 495 /* Write the last range: */
@@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
742 unsigned long x_remove_base, 592 unsigned long x_remove_base,
743 unsigned long x_remove_size, int i) 593 unsigned long x_remove_size, int i)
744{ 594{
745 static struct res_range range_new[RANGE_NUM]; 595 static struct range range_new[RANGE_NUM];
746 unsigned long range_sums_new; 596 unsigned long range_sums_new;
747 static int nr_range_new; 597 static int nr_range_new;
748 int num_reg; 598 int num_reg;
@@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits)
869 * [0, 1M) should always be covered by var mtrr with WB 719 * [0, 1M) should always be covered by var mtrr with WB
870 * and fixed mtrrs should take effect before var mtrr for it: 720 * and fixed mtrrs should take effect before var mtrr for it:
871 */ 721 */
872 nr_range = add_range_with_merge(range, nr_range, 0, 722 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
873 (1ULL<<(20 - PAGE_SHIFT)) - 1); 723 1ULL<<(20 - PAGE_SHIFT));
874 /* Sort the ranges: */ 724 /* Sort the ranges: */
875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 725 sort_range(range, nr_range);
876 726
877 range_sums = sum_ranges(range, nr_range); 727 range_sums = sum_ranges(range, nr_range);
878 printk(KERN_INFO "total RAM covered: %ldM\n", 728 printk(KERN_INFO "total RAM covered: %ldM\n",
@@ -1089,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1089 nr_range = 0; 939 nr_range = 0;
1090 if (mtrr_tom2) { 940 if (mtrr_tom2) {
1091 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); 941 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1092 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; 942 range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
1093 if (highest_pfn < range[nr_range].end + 1) 943 if (highest_pfn < range[nr_range].end)
1094 highest_pfn = range[nr_range].end + 1; 944 highest_pfn = range[nr_range].end;
1095 nr_range++; 945 nr_range++;
1096 } 946 }
1097 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); 947 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -1103,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1103 953
1104 /* Check the holes: */ 954 /* Check the holes: */
1105 for (i = 0; i < nr_range - 1; i++) { 955 for (i = 0; i < nr_range - 1; i++) {
1106 if (range[i].end + 1 < range[i+1].start) 956 if (range[i].end < range[i+1].start)
1107 total_trim_size += real_trim_memory(range[i].end + 1, 957 total_trim_size += real_trim_memory(range[i].end,
1108 range[i+1].start); 958 range[i+1].start);
1109 } 959 }
1110 960
1111 /* Check the top: */ 961 /* Check the top: */
1112 i = nr_range - 1; 962 i = nr_range - 1;
1113 if (range[i].end + 1 < end_pfn) 963 if (range[i].end < end_pfn)
1114 total_trim_size += real_trim_memory(range[i].end + 1, 964 total_trim_size += real_trim_memory(range[i].end,
1115 end_pfn); 965 end_pfn);
1116 966
1117 if (total_trim_size) { 967 if (total_trim_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 228d982ce09c..68a3343e5798 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -265,7 +265,7 @@ static void cyrix_set_all(void)
265 post_set(); 265 post_set();
266} 266}
267 267
268static struct mtrr_ops cyrix_mtrr_ops = { 268static const struct mtrr_ops cyrix_mtrr_ops = {
269 .vendor = X86_VENDOR_CYRIX, 269 .vendor = X86_VENDOR_CYRIX,
270 .set_all = cyrix_set_all, 270 .set_all = cyrix_set_all,
271 .set = cyrix_set_arr, 271 .set = cyrix_set_arr,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 55da0c5f68dd..9aa5dc76ff4a 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -464,7 +464,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
464 tmp |= ~((1<<(hi - 1)) - 1); 464 tmp |= ~((1<<(hi - 1)) - 1);
465 465
466 if (tmp != mask_lo) { 466 if (tmp != mask_lo) {
467 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); 467 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
468 mask_lo = tmp; 468 mask_lo = tmp;
469 } 469 }
470 } 470 }
@@ -570,7 +570,7 @@ static unsigned long set_mtrr_state(void)
570 570
571 571
572static unsigned long cr4; 572static unsigned long cr4;
573static DEFINE_SPINLOCK(set_atomicity_lock); 573static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
574 574
575/* 575/*
576 * Since we are disabling the cache don't allow any interrupts, 576 * Since we are disabling the cache don't allow any interrupts,
@@ -590,7 +590,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
590 * changes to the way the kernel boots 590 * changes to the way the kernel boots
591 */ 591 */
592 592
593 spin_lock(&set_atomicity_lock); 593 raw_spin_lock(&set_atomicity_lock);
594 594
595 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ 595 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
596 cr0 = read_cr0() | X86_CR0_CD; 596 cr0 = read_cr0() | X86_CR0_CD;
@@ -627,7 +627,7 @@ static void post_set(void) __releases(set_atomicity_lock)
627 /* Restore value of CR4 */ 627 /* Restore value of CR4 */
628 if (cpu_has_pge) 628 if (cpu_has_pge)
629 write_cr4(cr4); 629 write_cr4(cr4);
630 spin_unlock(&set_atomicity_lock); 630 raw_spin_unlock(&set_atomicity_lock);
631} 631}
632 632
633static void generic_set_all(void) 633static void generic_set_all(void)
@@ -752,7 +752,7 @@ int positive_have_wrcomb(void)
752/* 752/*
753 * Generic structure... 753 * Generic structure...
754 */ 754 */
755struct mtrr_ops generic_mtrr_ops = { 755const struct mtrr_ops generic_mtrr_ops = {
756 .use_intel_if = 1, 756 .use_intel_if = 1,
757 .set_all = generic_set_all, 757 .set_all = generic_set_all,
758 .get = generic_get_mtrr, 758 .get = generic_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 84e83de54575..79556bd9b602 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex);
60u64 size_or_mask, size_and_mask; 60u64 size_or_mask, size_and_mask;
61static bool mtrr_aps_delayed_init; 61static bool mtrr_aps_delayed_init;
62 62
63static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; 63static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
64 64
65struct mtrr_ops *mtrr_if; 65const struct mtrr_ops *mtrr_if;
66 66
67static void set_mtrr(unsigned int reg, unsigned long base, 67static void set_mtrr(unsigned int reg, unsigned long base,
68 unsigned long size, mtrr_type type); 68 unsigned long size, mtrr_type type);
69 69
70void set_mtrr_ops(struct mtrr_ops *ops) 70void set_mtrr_ops(const struct mtrr_ops *ops)
71{ 71{
72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM) 72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
73 mtrr_ops[ops->vendor] = ops; 73 mtrr_ops[ops->vendor] = ops;
@@ -145,6 +145,7 @@ struct set_mtrr_data {
145 145
146/** 146/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data
148 * 149 *
149 * Returns nothing. 150 * Returns nothing.
150 */ 151 */
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index a501dee9a87a..df5e41f31a27 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size,
32extern int generic_validate_add_page(unsigned long base, unsigned long size, 32extern int generic_validate_add_page(unsigned long base, unsigned long size,
33 unsigned int type); 33 unsigned int type);
34 34
35extern struct mtrr_ops generic_mtrr_ops; 35extern const struct mtrr_ops generic_mtrr_ops;
36 36
37extern int positive_have_wrcomb(void); 37extern int positive_have_wrcomb(void);
38 38
@@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index,
53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); 53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
54void get_mtrr_state(void); 54void get_mtrr_state(void);
55 55
56extern void set_mtrr_ops(struct mtrr_ops *ops); 56extern void set_mtrr_ops(const struct mtrr_ops *ops);
57 57
58extern u64 size_or_mask, size_and_mask; 58extern u64 size_or_mask, size_and_mask;
59extern struct mtrr_ops *mtrr_if; 59extern const struct mtrr_ops *mtrr_if;
60 60
61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) 61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) 62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
deleted file mode 100644
index dfc80b4e6b0d..000000000000
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ /dev/null
@@ -1,94 +0,0 @@
1#include <linux/init.h>
2#include <linux/io.h>
3#include <linux/mm.h>
4
5#include <asm/processor-cyrix.h>
6#include <asm/processor-flags.h>
7#include <asm/mtrr.h>
8#include <asm/msr.h>
9
10#include "mtrr.h"
11
12/* Put the processor into a state where MTRRs can be safely set */
13void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
14{
15 unsigned int cr0;
16
17 /* Disable interrupts locally */
18 local_irq_save(ctxt->flags);
19
20 if (use_intel() || is_cpu(CYRIX)) {
21
22 /* Save value of CR4 and clear Page Global Enable (bit 7) */
23 if (cpu_has_pge) {
24 ctxt->cr4val = read_cr4();
25 write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
26 }
27
28 /*
29 * Disable and flush caches. Note that wbinvd flushes the TLBs
30 * as a side-effect
31 */
32 cr0 = read_cr0() | X86_CR0_CD;
33 wbinvd();
34 write_cr0(cr0);
35 wbinvd();
36
37 if (use_intel()) {
38 /* Save MTRR state */
39 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
40 } else {
41 /*
42 * Cyrix ARRs -
43 * everything else were excluded at the top
44 */
45 ctxt->ccr3 = getCx86(CX86_CCR3);
46 }
47 }
48}
49
50void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
51{
52 if (use_intel()) {
53 /* Disable MTRRs, and set the default type to uncached */
54 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
55 ctxt->deftype_hi);
56 } else {
57 if (is_cpu(CYRIX)) {
58 /* Cyrix ARRs - everything else were excluded at the top */
59 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
60 }
61 }
62}
63
64/* Restore the processor after a set_mtrr_prepare */
65void set_mtrr_done(struct set_mtrr_context *ctxt)
66{
67 if (use_intel() || is_cpu(CYRIX)) {
68
69 /* Flush caches and TLBs */
70 wbinvd();
71
72 /* Restore MTRRdefType */
73 if (use_intel()) {
74 /* Intel (P6) standard MTRRs */
75 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo,
76 ctxt->deftype_hi);
77 } else {
78 /*
79 * Cyrix ARRs -
80 * everything else was excluded at the top
81 */
82 setCx86(CX86_CCR3, ctxt->ccr3);
83 }
84
85 /* Enable caches */
86 write_cr0(read_cr0() & 0xbfffffff);
87
88 /* Restore value of CR4 */
89 if (cpu_has_pge)
90 write_cr4(ctxt->cr4val);
91 }
92 /* Re-enable interrupts locally (if enabled previously) */
93 local_irq_restore(ctxt->flags);
94}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8c1c07073ccc..42aafd11e170 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> 9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 * Copyright (C) 2009 Google, Inc., Stephane Eranian
10 * 11 *
11 * For licencing details see kernel-base/COPYING 12 * For licencing details see kernel-base/COPYING
12 */ 13 */
@@ -22,6 +23,7 @@
22#include <linux/uaccess.h> 23#include <linux/uaccess.h>
23#include <linux/highmem.h> 24#include <linux/highmem.h>
24#include <linux/cpu.h> 25#include <linux/cpu.h>
26#include <linux/bitops.h>
25 27
26#include <asm/apic.h> 28#include <asm/apic.h>
27#include <asm/stacktrace.h> 29#include <asm/stacktrace.h>
@@ -68,26 +70,59 @@ struct debug_store {
68 u64 pebs_event_reset[MAX_PEBS_EVENTS]; 70 u64 pebs_event_reset[MAX_PEBS_EVENTS];
69}; 71};
70 72
73struct event_constraint {
74 union {
75 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
76 u64 idxmsk64;
77 };
78 u64 code;
79 u64 cmask;
80 int weight;
81};
82
83struct amd_nb {
84 int nb_id; /* NorthBridge id */
85 int refcnt; /* reference count */
86 struct perf_event *owners[X86_PMC_IDX_MAX];
87 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
88};
89
71struct cpu_hw_events { 90struct cpu_hw_events {
72 struct perf_event *events[X86_PMC_IDX_MAX]; 91 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 92 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
75 unsigned long interrupts; 93 unsigned long interrupts;
76 int enabled; 94 int enabled;
77 struct debug_store *ds; 95 struct debug_store *ds;
78};
79 96
80struct event_constraint { 97 int n_events;
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 98 int n_added;
82 int code; 99 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
100 u64 tags[X86_PMC_IDX_MAX];
101 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
102 struct amd_nb *amd_nb;
83}; 103};
84 104
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } 105#define __EVENT_CONSTRAINT(c, n, m, w) {\
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } 106 { .idxmsk64 = (n) }, \
107 .code = (c), \
108 .cmask = (m), \
109 .weight = (w), \
110}
111
112#define EVENT_CONSTRAINT(c, n, m) \
113 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
87 114
88#define for_each_event_constraint(e, c) \ 115#define INTEL_EVENT_CONSTRAINT(c, n) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++) 116 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
90 117
118#define FIXED_EVENT_CONSTRAINT(c, n) \
119 EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
120
121#define EVENT_CONSTRAINT_END \
122 EVENT_CONSTRAINT(0, 0, 0)
123
124#define for_each_event_constraint(e, c) \
125 for ((e) = (c); (e)->cmask; (e)++)
91 126
92/* 127/*
93 * struct x86_pmu - generic x86 pmu 128 * struct x86_pmu - generic x86 pmu
@@ -114,8 +149,14 @@ struct x86_pmu {
114 u64 intel_ctrl; 149 u64 intel_ctrl;
115 void (*enable_bts)(u64 config); 150 void (*enable_bts)(u64 config);
116 void (*disable_bts)(void); 151 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc, 152
118 struct hw_perf_event *hwc); 153 struct event_constraint *
154 (*get_event_constraints)(struct cpu_hw_events *cpuc,
155 struct perf_event *event);
156
157 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
158 struct perf_event *event);
159 struct event_constraint *event_constraints;
119}; 160};
120 161
121static struct x86_pmu x86_pmu __read_mostly; 162static struct x86_pmu x86_pmu __read_mostly;
@@ -124,111 +165,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
124 .enabled = 1, 165 .enabled = 1,
125}; 166};
126 167
127static const struct event_constraint *event_constraints; 168static int x86_perf_event_set_period(struct perf_event *event,
128 169 struct hw_perf_event *hwc, int idx);
129/*
130 * Not sure about some of these
131 */
132static const u64 p6_perfmon_event_map[] =
133{
134 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
135 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
136 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
137 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
138 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
139 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
140 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
141};
142
143static u64 p6_pmu_event_map(int hw_event)
144{
145 return p6_perfmon_event_map[hw_event];
146}
147
148/*
149 * Event setting that is specified not to count anything.
150 * We use this to effectively disable a counter.
151 *
152 * L2_RQSTS with 0 MESI unit mask.
153 */
154#define P6_NOP_EVENT 0x0000002EULL
155
156static u64 p6_pmu_raw_event(u64 hw_event)
157{
158#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
159#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
160#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
161#define P6_EVNTSEL_INV_MASK 0x00800000ULL
162#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
163
164#define P6_EVNTSEL_MASK \
165 (P6_EVNTSEL_EVENT_MASK | \
166 P6_EVNTSEL_UNIT_MASK | \
167 P6_EVNTSEL_EDGE_MASK | \
168 P6_EVNTSEL_INV_MASK | \
169 P6_EVNTSEL_REG_MASK)
170
171 return hw_event & P6_EVNTSEL_MASK;
172}
173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
184
185/*
186 * Intel PerfMon v3. Used on Core2 and later.
187 */
188static const u64 intel_perfmon_event_map[] =
189{
190 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
191 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
192 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
193 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
194 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
195 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
197};
198
199static const struct event_constraint intel_core_event_constraints[] =
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212
213static const struct event_constraint intel_nehalem_event_constraints[] =
214{
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226};
227
228static u64 intel_pmu_event_map(int hw_event)
229{
230 return intel_perfmon_event_map[hw_event];
231}
232 170
233/* 171/*
234 * Generalized hw caching related hw_event table, filled 172 * Generalized hw caching related hw_event table, filled
@@ -245,424 +183,6 @@ static u64 __read_mostly hw_cache_event_ids
245 [PERF_COUNT_HW_CACHE_OP_MAX] 183 [PERF_COUNT_HW_CACHE_OP_MAX]
246 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 184 [PERF_COUNT_HW_CACHE_RESULT_MAX];
247 185
248static __initconst u64 nehalem_hw_cache_event_ids
249 [PERF_COUNT_HW_CACHE_MAX]
250 [PERF_COUNT_HW_CACHE_OP_MAX]
251 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
252{
253 [ C(L1D) ] = {
254 [ C(OP_READ) ] = {
255 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
256 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
257 },
258 [ C(OP_WRITE) ] = {
259 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
260 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
261 },
262 [ C(OP_PREFETCH) ] = {
263 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
264 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
265 },
266 },
267 [ C(L1I ) ] = {
268 [ C(OP_READ) ] = {
269 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
270 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
271 },
272 [ C(OP_WRITE) ] = {
273 [ C(RESULT_ACCESS) ] = -1,
274 [ C(RESULT_MISS) ] = -1,
275 },
276 [ C(OP_PREFETCH) ] = {
277 [ C(RESULT_ACCESS) ] = 0x0,
278 [ C(RESULT_MISS) ] = 0x0,
279 },
280 },
281 [ C(LL ) ] = {
282 [ C(OP_READ) ] = {
283 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
284 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
285 },
286 [ C(OP_WRITE) ] = {
287 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
288 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
289 },
290 [ C(OP_PREFETCH) ] = {
291 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
292 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
293 },
294 },
295 [ C(DTLB) ] = {
296 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
298 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
299 },
300 [ C(OP_WRITE) ] = {
301 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
302 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
303 },
304 [ C(OP_PREFETCH) ] = {
305 [ C(RESULT_ACCESS) ] = 0x0,
306 [ C(RESULT_MISS) ] = 0x0,
307 },
308 },
309 [ C(ITLB) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
312 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
313 },
314 [ C(OP_WRITE) ] = {
315 [ C(RESULT_ACCESS) ] = -1,
316 [ C(RESULT_MISS) ] = -1,
317 },
318 [ C(OP_PREFETCH) ] = {
319 [ C(RESULT_ACCESS) ] = -1,
320 [ C(RESULT_MISS) ] = -1,
321 },
322 },
323 [ C(BPU ) ] = {
324 [ C(OP_READ) ] = {
325 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
326 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
327 },
328 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1,
330 [ C(RESULT_MISS) ] = -1,
331 },
332 [ C(OP_PREFETCH) ] = {
333 [ C(RESULT_ACCESS) ] = -1,
334 [ C(RESULT_MISS) ] = -1,
335 },
336 },
337};
338
339static __initconst u64 core2_hw_cache_event_ids
340 [PERF_COUNT_HW_CACHE_MAX]
341 [PERF_COUNT_HW_CACHE_OP_MAX]
342 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
343{
344 [ C(L1D) ] = {
345 [ C(OP_READ) ] = {
346 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
347 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
348 },
349 [ C(OP_WRITE) ] = {
350 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
351 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
352 },
353 [ C(OP_PREFETCH) ] = {
354 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
355 [ C(RESULT_MISS) ] = 0,
356 },
357 },
358 [ C(L1I ) ] = {
359 [ C(OP_READ) ] = {
360 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
361 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
362 },
363 [ C(OP_WRITE) ] = {
364 [ C(RESULT_ACCESS) ] = -1,
365 [ C(RESULT_MISS) ] = -1,
366 },
367 [ C(OP_PREFETCH) ] = {
368 [ C(RESULT_ACCESS) ] = 0,
369 [ C(RESULT_MISS) ] = 0,
370 },
371 },
372 [ C(LL ) ] = {
373 [ C(OP_READ) ] = {
374 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
375 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
376 },
377 [ C(OP_WRITE) ] = {
378 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
379 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
380 },
381 [ C(OP_PREFETCH) ] = {
382 [ C(RESULT_ACCESS) ] = 0,
383 [ C(RESULT_MISS) ] = 0,
384 },
385 },
386 [ C(DTLB) ] = {
387 [ C(OP_READ) ] = {
388 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
389 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
390 },
391 [ C(OP_WRITE) ] = {
392 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
393 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
394 },
395 [ C(OP_PREFETCH) ] = {
396 [ C(RESULT_ACCESS) ] = 0,
397 [ C(RESULT_MISS) ] = 0,
398 },
399 },
400 [ C(ITLB) ] = {
401 [ C(OP_READ) ] = {
402 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
403 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
404 },
405 [ C(OP_WRITE) ] = {
406 [ C(RESULT_ACCESS) ] = -1,
407 [ C(RESULT_MISS) ] = -1,
408 },
409 [ C(OP_PREFETCH) ] = {
410 [ C(RESULT_ACCESS) ] = -1,
411 [ C(RESULT_MISS) ] = -1,
412 },
413 },
414 [ C(BPU ) ] = {
415 [ C(OP_READ) ] = {
416 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
417 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
418 },
419 [ C(OP_WRITE) ] = {
420 [ C(RESULT_ACCESS) ] = -1,
421 [ C(RESULT_MISS) ] = -1,
422 },
423 [ C(OP_PREFETCH) ] = {
424 [ C(RESULT_ACCESS) ] = -1,
425 [ C(RESULT_MISS) ] = -1,
426 },
427 },
428};
429
430static __initconst u64 atom_hw_cache_event_ids
431 [PERF_COUNT_HW_CACHE_MAX]
432 [PERF_COUNT_HW_CACHE_OP_MAX]
433 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
434{
435 [ C(L1D) ] = {
436 [ C(OP_READ) ] = {
437 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
438 [ C(RESULT_MISS) ] = 0,
439 },
440 [ C(OP_WRITE) ] = {
441 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
442 [ C(RESULT_MISS) ] = 0,
443 },
444 [ C(OP_PREFETCH) ] = {
445 [ C(RESULT_ACCESS) ] = 0x0,
446 [ C(RESULT_MISS) ] = 0,
447 },
448 },
449 [ C(L1I ) ] = {
450 [ C(OP_READ) ] = {
451 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
452 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
453 },
454 [ C(OP_WRITE) ] = {
455 [ C(RESULT_ACCESS) ] = -1,
456 [ C(RESULT_MISS) ] = -1,
457 },
458 [ C(OP_PREFETCH) ] = {
459 [ C(RESULT_ACCESS) ] = 0,
460 [ C(RESULT_MISS) ] = 0,
461 },
462 },
463 [ C(LL ) ] = {
464 [ C(OP_READ) ] = {
465 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
466 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
467 },
468 [ C(OP_WRITE) ] = {
469 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
470 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
471 },
472 [ C(OP_PREFETCH) ] = {
473 [ C(RESULT_ACCESS) ] = 0,
474 [ C(RESULT_MISS) ] = 0,
475 },
476 },
477 [ C(DTLB) ] = {
478 [ C(OP_READ) ] = {
479 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
480 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
481 },
482 [ C(OP_WRITE) ] = {
483 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
484 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
485 },
486 [ C(OP_PREFETCH) ] = {
487 [ C(RESULT_ACCESS) ] = 0,
488 [ C(RESULT_MISS) ] = 0,
489 },
490 },
491 [ C(ITLB) ] = {
492 [ C(OP_READ) ] = {
493 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
494 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
495 },
496 [ C(OP_WRITE) ] = {
497 [ C(RESULT_ACCESS) ] = -1,
498 [ C(RESULT_MISS) ] = -1,
499 },
500 [ C(OP_PREFETCH) ] = {
501 [ C(RESULT_ACCESS) ] = -1,
502 [ C(RESULT_MISS) ] = -1,
503 },
504 },
505 [ C(BPU ) ] = {
506 [ C(OP_READ) ] = {
507 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
508 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
509 },
510 [ C(OP_WRITE) ] = {
511 [ C(RESULT_ACCESS) ] = -1,
512 [ C(RESULT_MISS) ] = -1,
513 },
514 [ C(OP_PREFETCH) ] = {
515 [ C(RESULT_ACCESS) ] = -1,
516 [ C(RESULT_MISS) ] = -1,
517 },
518 },
519};
520
521static u64 intel_pmu_raw_event(u64 hw_event)
522{
523#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
528
529#define CORE_EVNTSEL_MASK \
530 (CORE_EVNTSEL_EVENT_MASK | \
531 CORE_EVNTSEL_UNIT_MASK | \
532 CORE_EVNTSEL_EDGE_MASK | \
533 CORE_EVNTSEL_INV_MASK | \
534 CORE_EVNTSEL_REG_MASK)
535
536 return hw_event & CORE_EVNTSEL_MASK;
537}
538
539static __initconst u64 amd_hw_cache_event_ids
540 [PERF_COUNT_HW_CACHE_MAX]
541 [PERF_COUNT_HW_CACHE_OP_MAX]
542 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
543{
544 [ C(L1D) ] = {
545 [ C(OP_READ) ] = {
546 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
547 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
548 },
549 [ C(OP_WRITE) ] = {
550 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
551 [ C(RESULT_MISS) ] = 0,
552 },
553 [ C(OP_PREFETCH) ] = {
554 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
555 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
556 },
557 },
558 [ C(L1I ) ] = {
559 [ C(OP_READ) ] = {
560 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
561 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
562 },
563 [ C(OP_WRITE) ] = {
564 [ C(RESULT_ACCESS) ] = -1,
565 [ C(RESULT_MISS) ] = -1,
566 },
567 [ C(OP_PREFETCH) ] = {
568 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
569 [ C(RESULT_MISS) ] = 0,
570 },
571 },
572 [ C(LL ) ] = {
573 [ C(OP_READ) ] = {
574 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
575 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
576 },
577 [ C(OP_WRITE) ] = {
578 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
579 [ C(RESULT_MISS) ] = 0,
580 },
581 [ C(OP_PREFETCH) ] = {
582 [ C(RESULT_ACCESS) ] = 0,
583 [ C(RESULT_MISS) ] = 0,
584 },
585 },
586 [ C(DTLB) ] = {
587 [ C(OP_READ) ] = {
588 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
589 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
590 },
591 [ C(OP_WRITE) ] = {
592 [ C(RESULT_ACCESS) ] = 0,
593 [ C(RESULT_MISS) ] = 0,
594 },
595 [ C(OP_PREFETCH) ] = {
596 [ C(RESULT_ACCESS) ] = 0,
597 [ C(RESULT_MISS) ] = 0,
598 },
599 },
600 [ C(ITLB) ] = {
601 [ C(OP_READ) ] = {
602 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
603 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
604 },
605 [ C(OP_WRITE) ] = {
606 [ C(RESULT_ACCESS) ] = -1,
607 [ C(RESULT_MISS) ] = -1,
608 },
609 [ C(OP_PREFETCH) ] = {
610 [ C(RESULT_ACCESS) ] = -1,
611 [ C(RESULT_MISS) ] = -1,
612 },
613 },
614 [ C(BPU ) ] = {
615 [ C(OP_READ) ] = {
616 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
617 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
618 },
619 [ C(OP_WRITE) ] = {
620 [ C(RESULT_ACCESS) ] = -1,
621 [ C(RESULT_MISS) ] = -1,
622 },
623 [ C(OP_PREFETCH) ] = {
624 [ C(RESULT_ACCESS) ] = -1,
625 [ C(RESULT_MISS) ] = -1,
626 },
627 },
628};
629
630/*
631 * AMD Performance Monitor K7 and later.
632 */
633static const u64 amd_perfmon_event_map[] =
634{
635 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
636 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
637 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
638 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
639 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
640 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
641};
642
643static u64 amd_pmu_event_map(int hw_event)
644{
645 return amd_perfmon_event_map[hw_event];
646}
647
648static u64 amd_pmu_raw_event(u64 hw_event)
649{
650#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
651#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
652#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
653#define K7_EVNTSEL_INV_MASK 0x000800000ULL
654#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
655
656#define K7_EVNTSEL_MASK \
657 (K7_EVNTSEL_EVENT_MASK | \
658 K7_EVNTSEL_UNIT_MASK | \
659 K7_EVNTSEL_EDGE_MASK | \
660 K7_EVNTSEL_INV_MASK | \
661 K7_EVNTSEL_REG_MASK)
662
663 return hw_event & K7_EVNTSEL_MASK;
664}
665
666/* 186/*
667 * Propagate event elapsed time into the generic event. 187 * Propagate event elapsed time into the generic event.
668 * Can only be executed on the CPU where the event is active. 188 * Can only be executed on the CPU where the event is active.
@@ -914,42 +434,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
914 return 0; 434 return 0;
915} 435}
916 436
917static void intel_pmu_enable_bts(u64 config)
918{
919 unsigned long debugctlmsr;
920
921 debugctlmsr = get_debugctlmsr();
922
923 debugctlmsr |= X86_DEBUGCTL_TR;
924 debugctlmsr |= X86_DEBUGCTL_BTS;
925 debugctlmsr |= X86_DEBUGCTL_BTINT;
926
927 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
928 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
929
930 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
931 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
932
933 update_debugctlmsr(debugctlmsr);
934}
935
936static void intel_pmu_disable_bts(void)
937{
938 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
939 unsigned long debugctlmsr;
940
941 if (!cpuc->ds)
942 return;
943
944 debugctlmsr = get_debugctlmsr();
945
946 debugctlmsr &=
947 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
948 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
949
950 update_debugctlmsr(debugctlmsr);
951}
952
953/* 437/*
954 * Setup the hardware configuration for a given attr_type 438 * Setup the hardware configuration for a given attr_type
955 */ 439 */
@@ -988,6 +472,8 @@ static int __hw_perf_event_init(struct perf_event *event)
988 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 472 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
989 473
990 hwc->idx = -1; 474 hwc->idx = -1;
475 hwc->last_cpu = -1;
476 hwc->last_tag = ~0ULL;
991 477
992 /* 478 /*
993 * Count user and OS events unless requested not to. 479 * Count user and OS events unless requested not to.
@@ -1017,6 +503,9 @@ static int __hw_perf_event_init(struct perf_event *event)
1017 */ 503 */
1018 if (attr->type == PERF_TYPE_RAW) { 504 if (attr->type == PERF_TYPE_RAW) {
1019 hwc->config |= x86_pmu.raw_event(attr->config); 505 hwc->config |= x86_pmu.raw_event(attr->config);
506 if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
507 perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
508 return -EACCES;
1020 return 0; 509 return 0;
1021 } 510 }
1022 511
@@ -1056,216 +545,323 @@ static int __hw_perf_event_init(struct perf_event *event)
1056 return 0; 545 return 0;
1057} 546}
1058 547
1059static void p6_pmu_disable_all(void) 548static void x86_pmu_disable_all(void)
1060{ 549{
1061 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 550 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1062 u64 val; 551 int idx;
1063
1064 if (!cpuc->enabled)
1065 return;
1066 552
1067 cpuc->enabled = 0; 553 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1068 barrier(); 554 u64 val;
1069 555
1070 /* p6 only has one enable register */ 556 if (!test_bit(idx, cpuc->active_mask))
1071 rdmsrl(MSR_P6_EVNTSEL0, val); 557 continue;
1072 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 558 rdmsrl(x86_pmu.eventsel + idx, val);
1073 wrmsrl(MSR_P6_EVNTSEL0, val); 559 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
560 continue;
561 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
562 wrmsrl(x86_pmu.eventsel + idx, val);
563 }
1074} 564}
1075 565
1076static void intel_pmu_disable_all(void) 566void hw_perf_disable(void)
1077{ 567{
1078 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 568 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1079 569
570 if (!x86_pmu_initialized())
571 return;
572
1080 if (!cpuc->enabled) 573 if (!cpuc->enabled)
1081 return; 574 return;
1082 575
576 cpuc->n_added = 0;
1083 cpuc->enabled = 0; 577 cpuc->enabled = 0;
1084 barrier(); 578 barrier();
1085 579
1086 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 580 x86_pmu.disable_all();
1087
1088 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
1089 intel_pmu_disable_bts();
1090} 581}
1091 582
1092static void amd_pmu_disable_all(void) 583static void x86_pmu_enable_all(void)
1093{ 584{
1094 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 585 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1095 int idx; 586 int idx;
1096 587
1097 if (!cpuc->enabled)
1098 return;
1099
1100 cpuc->enabled = 0;
1101 /*
1102 * ensure we write the disable before we start disabling the
1103 * events proper, so that amd_pmu_enable_event() does the
1104 * right thing.
1105 */
1106 barrier();
1107
1108 for (idx = 0; idx < x86_pmu.num_events; idx++) { 588 for (idx = 0; idx < x86_pmu.num_events; idx++) {
589 struct perf_event *event = cpuc->events[idx];
1109 u64 val; 590 u64 val;
1110 591
1111 if (!test_bit(idx, cpuc->active_mask)) 592 if (!test_bit(idx, cpuc->active_mask))
1112 continue; 593 continue;
1113 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 594
1114 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) 595 val = event->hw.config;
1115 continue; 596 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
1116 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 597 wrmsrl(x86_pmu.eventsel + idx, val);
1117 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1118 } 598 }
1119} 599}
1120 600
1121void hw_perf_disable(void) 601static const struct pmu pmu;
602
603static inline int is_x86_event(struct perf_event *event)
1122{ 604{
1123 if (!x86_pmu_initialized()) 605 return event->pmu == &pmu;
1124 return;
1125 return x86_pmu.disable_all();
1126} 606}
1127 607
1128static void p6_pmu_enable_all(void) 608static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
1129{ 609{
1130 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 610 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
1131 unsigned long val; 611 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
612 int i, j, w, wmax, num = 0;
613 struct hw_perf_event *hwc;
1132 614
1133 if (cpuc->enabled) 615 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1134 return;
1135 616
1136 cpuc->enabled = 1; 617 for (i = 0; i < n; i++) {
1137 barrier(); 618 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
619 constraints[i] = c;
620 }
1138 621
1139 /* p6 only has one enable register */ 622 /*
1140 rdmsrl(MSR_P6_EVNTSEL0, val); 623 * fastpath, try to reuse previous register
1141 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 624 */
1142 wrmsrl(MSR_P6_EVNTSEL0, val); 625 for (i = 0; i < n; i++) {
1143} 626 hwc = &cpuc->event_list[i]->hw;
627 c = constraints[i];
1144 628
1145static void intel_pmu_enable_all(void) 629 /* never assigned */
1146{ 630 if (hwc->idx == -1)
1147 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 631 break;
1148 632
1149 if (cpuc->enabled) 633 /* constraint still honored */
1150 return; 634 if (!test_bit(hwc->idx, c->idxmsk))
635 break;
1151 636
1152 cpuc->enabled = 1; 637 /* not already used */
1153 barrier(); 638 if (test_bit(hwc->idx, used_mask))
639 break;
640
641 set_bit(hwc->idx, used_mask);
642 if (assign)
643 assign[i] = hwc->idx;
644 }
645 if (i == n)
646 goto done;
1154 647
1155 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 648 /*
649 * begin slow path
650 */
651
652 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1156 653
1157 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 654 /*
1158 struct perf_event *event = 655 * weight = number of possible counters
1159 cpuc->events[X86_PMC_IDX_FIXED_BTS]; 656 *
657 * 1 = most constrained, only works on one counter
658 * wmax = least constrained, works on any counter
659 *
660 * assign events to counters starting with most
661 * constrained events.
662 */
663 wmax = x86_pmu.num_events;
664
665 /*
666 * when fixed event counters are present,
667 * wmax is incremented by 1 to account
668 * for one more choice
669 */
670 if (x86_pmu.num_events_fixed)
671 wmax++;
672
673 for (w = 1, num = n; num && w <= wmax; w++) {
674 /* for each event */
675 for (i = 0; num && i < n; i++) {
676 c = constraints[i];
677 hwc = &cpuc->event_list[i]->hw;
678
679 if (c->weight != w)
680 continue;
681
682 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
683 if (!test_bit(j, used_mask))
684 break;
685 }
1160 686
1161 if (WARN_ON_ONCE(!event)) 687 if (j == X86_PMC_IDX_MAX)
1162 return; 688 break;
1163 689
1164 intel_pmu_enable_bts(event->hw.config); 690 set_bit(j, used_mask);
691
692 if (assign)
693 assign[i] = j;
694 num--;
695 }
1165 } 696 }
697done:
698 /*
699 * scheduling failed or is just a simulation,
700 * free resources if necessary
701 */
702 if (!assign || num) {
703 for (i = 0; i < n; i++) {
704 if (x86_pmu.put_event_constraints)
705 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
706 }
707 }
708 return num ? -ENOSPC : 0;
1166} 709}
1167 710
1168static void amd_pmu_enable_all(void) 711/*
712 * dogrp: true if must collect siblings events (group)
713 * returns total number of events and error code
714 */
715static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
1169{ 716{
1170 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 717 struct perf_event *event;
1171 int idx; 718 int n, max_count;
1172 719
1173 if (cpuc->enabled) 720 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
1174 return;
1175 721
1176 cpuc->enabled = 1; 722 /* current number of events already accepted */
1177 barrier(); 723 n = cpuc->n_events;
1178 724
1179 for (idx = 0; idx < x86_pmu.num_events; idx++) { 725 if (is_x86_event(leader)) {
1180 struct perf_event *event = cpuc->events[idx]; 726 if (n >= max_count)
1181 u64 val; 727 return -ENOSPC;
728 cpuc->event_list[n] = leader;
729 n++;
730 }
731 if (!dogrp)
732 return n;
1182 733
1183 if (!test_bit(idx, cpuc->active_mask)) 734 list_for_each_entry(event, &leader->sibling_list, group_entry) {
735 if (!is_x86_event(event) ||
736 event->state <= PERF_EVENT_STATE_OFF)
1184 continue; 737 continue;
1185 738
1186 val = event->hw.config; 739 if (n >= max_count)
1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 740 return -ENOSPC;
1188 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1189 }
1190}
1191 741
1192void hw_perf_enable(void) 742 cpuc->event_list[n] = event;
1193{ 743 n++;
1194 if (!x86_pmu_initialized()) 744 }
1195 return; 745 return n;
1196 x86_pmu.enable_all();
1197} 746}
1198 747
1199static inline u64 intel_pmu_get_status(void) 748static inline void x86_assign_hw_event(struct perf_event *event,
749 struct cpu_hw_events *cpuc, int i)
1200{ 750{
1201 u64 status; 751 struct hw_perf_event *hwc = &event->hw;
1202 752
1203 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 753 hwc->idx = cpuc->assign[i];
754 hwc->last_cpu = smp_processor_id();
755 hwc->last_tag = ++cpuc->tags[i];
1204 756
1205 return status; 757 if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
758 hwc->config_base = 0;
759 hwc->event_base = 0;
760 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
761 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
762 /*
763 * We set it so that event_base + idx in wrmsr/rdmsr maps to
764 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
765 */
766 hwc->event_base =
767 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
768 } else {
769 hwc->config_base = x86_pmu.eventsel;
770 hwc->event_base = x86_pmu.perfctr;
771 }
1206} 772}
1207 773
1208static inline void intel_pmu_ack_status(u64 ack) 774static inline int match_prev_assignment(struct hw_perf_event *hwc,
775 struct cpu_hw_events *cpuc,
776 int i)
1209{ 777{
1210 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 778 return hwc->idx == cpuc->assign[i] &&
779 hwc->last_cpu == smp_processor_id() &&
780 hwc->last_tag == cpuc->tags[i];
1211} 781}
1212 782
1213static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) 783static void x86_pmu_stop(struct perf_event *event);
1214{
1215 (void)checking_wrmsrl(hwc->config_base + idx,
1216 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
1217}
1218 784
1219static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) 785void hw_perf_enable(void)
1220{ 786{
1221 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 787 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1222} 788 struct perf_event *event;
789 struct hw_perf_event *hwc;
790 int i;
1223 791
1224static inline void 792 if (!x86_pmu_initialized())
1225intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) 793 return;
1226{
1227 int idx = __idx - X86_PMC_IDX_FIXED;
1228 u64 ctrl_val, mask;
1229 794
1230 mask = 0xfULL << (idx * 4); 795 if (cpuc->enabled)
796 return;
1231 797
1232 rdmsrl(hwc->config_base, ctrl_val); 798 if (cpuc->n_added) {
1233 ctrl_val &= ~mask; 799 /*
1234 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 800 * apply assignment obtained either from
1235} 801 * hw_perf_group_sched_in() or x86_pmu_enable()
802 *
803 * step1: save events moving to new counters
804 * step2: reprogram moved events into new counters
805 */
806 for (i = 0; i < cpuc->n_events; i++) {
1236 807
1237static inline void 808 event = cpuc->event_list[i];
1238p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) 809 hwc = &event->hw;
1239{
1240 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1241 u64 val = P6_NOP_EVENT;
1242 810
1243 if (cpuc->enabled) 811 /*
1244 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 812 * we can avoid reprogramming counter if:
813 * - assigned same counter as last time
814 * - running on same CPU as last time
815 * - no other event has used the counter since
816 */
817 if (hwc->idx == -1 ||
818 match_prev_assignment(hwc, cpuc, i))
819 continue;
1245 820
1246 (void)checking_wrmsrl(hwc->config_base + idx, val); 821 x86_pmu_stop(event);
1247}
1248 822
1249static inline void 823 hwc->idx = -1;
1250intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) 824 }
1251{
1252 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1253 intel_pmu_disable_bts();
1254 return;
1255 }
1256 825
1257 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 826 for (i = 0; i < cpuc->n_events; i++) {
1258 intel_pmu_disable_fixed(hwc, idx); 827
1259 return; 828 event = cpuc->event_list[i];
829 hwc = &event->hw;
830
831 if (hwc->idx == -1) {
832 x86_assign_hw_event(event, cpuc, i);
833 x86_perf_event_set_period(event, hwc, hwc->idx);
834 }
835 /*
836 * need to mark as active because x86_pmu_disable()
837 * clear active_mask and events[] yet it preserves
838 * idx
839 */
840 set_bit(hwc->idx, cpuc->active_mask);
841 cpuc->events[hwc->idx] = event;
842
843 x86_pmu.enable(hwc, hwc->idx);
844 perf_event_update_userpage(event);
845 }
846 cpuc->n_added = 0;
847 perf_events_lapic_init();
1260 } 848 }
1261 849
1262 x86_pmu_disable_event(hwc, idx); 850 cpuc->enabled = 1;
851 barrier();
852
853 x86_pmu.enable_all();
854}
855
856static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
857{
858 (void)checking_wrmsrl(hwc->config_base + idx,
859 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
1263} 860}
1264 861
1265static inline void 862static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1266amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1267{ 863{
1268 x86_pmu_disable_event(hwc, idx); 864 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
1269} 865}
1270 866
1271static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 867static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1326,220 +922,60 @@ x86_perf_event_set_period(struct perf_event *event,
1326 return ret; 922 return ret;
1327} 923}
1328 924
1329static inline void 925static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1330intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1331{
1332 int idx = __idx - X86_PMC_IDX_FIXED;
1333 u64 ctrl_val, bits, mask;
1334 int err;
1335
1336 /*
1337 * Enable IRQ generation (0x8),
1338 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
1339 * if requested:
1340 */
1341 bits = 0x8ULL;
1342 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
1343 bits |= 0x2;
1344 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
1345 bits |= 0x1;
1346
1347 /*
1348 * ANY bit is supported in v3 and up
1349 */
1350 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
1351 bits |= 0x4;
1352
1353 bits <<= (idx * 4);
1354 mask = 0xfULL << (idx * 4);
1355
1356 rdmsrl(hwc->config_base, ctrl_val);
1357 ctrl_val &= ~mask;
1358 ctrl_val |= bits;
1359 err = checking_wrmsrl(hwc->config_base, ctrl_val);
1360}
1361
1362static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1363{ 926{
1364 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 927 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1365 u64 val;
1366
1367 val = hwc->config;
1368 if (cpuc->enabled) 928 if (cpuc->enabled)
1369 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 929 __x86_pmu_enable_event(hwc, idx);
1370
1371 (void)checking_wrmsrl(hwc->config_base + idx, val);
1372}
1373
1374
1375static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1376{
1377 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1378 if (!__get_cpu_var(cpu_hw_events).enabled)
1379 return;
1380
1381 intel_pmu_enable_bts(hwc->config);
1382 return;
1383 }
1384
1385 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
1386 intel_pmu_enable_fixed(hwc, idx);
1387 return;
1388 }
1389
1390 x86_pmu_enable_event(hwc, idx);
1391} 930}
1392 931
1393static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) 932/*
933 * activate a single event
934 *
935 * The event is added to the group of enabled events
936 * but only if it can be scehduled with existing events.
937 *
938 * Called with PMU disabled. If successful and return value 1,
939 * then guaranteed to call perf_enable() and hw_perf_enable()
940 */
941static int x86_pmu_enable(struct perf_event *event)
1394{ 942{
1395 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 943 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
944 struct hw_perf_event *hwc;
945 int assign[X86_PMC_IDX_MAX];
946 int n, n0, ret;
1396 947
1397 if (cpuc->enabled) 948 hwc = &event->hw;
1398 x86_pmu_enable_event(hwc, idx);
1399}
1400
1401static int fixed_mode_idx(struct hw_perf_event *hwc)
1402{
1403 unsigned int hw_event;
1404
1405 hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1406
1407 if (unlikely((hw_event ==
1408 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
1409 (hwc->sample_period == 1)))
1410 return X86_PMC_IDX_FIXED_BTS;
1411 949
1412 if (!x86_pmu.num_events_fixed) 950 n0 = cpuc->n_events;
1413 return -1; 951 n = collect_events(cpuc, event, false);
952 if (n < 0)
953 return n;
1414 954
955 ret = x86_schedule_events(cpuc, n, assign);
956 if (ret)
957 return ret;
1415 /* 958 /*
1416 * fixed counters do not take all possible filters 959 * copy new assignment, now we know it is possible
960 * will be used by hw_perf_enable()
1417 */ 961 */
1418 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) 962 memcpy(cpuc->assign, assign, n*sizeof(int));
1419 return -1;
1420
1421 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1422 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1423 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1424 return X86_PMC_IDX_FIXED_CPU_CYCLES;
1425 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1426 return X86_PMC_IDX_FIXED_BUS_CYCLES;
1427
1428 return -1;
1429}
1430
1431/*
1432 * generic counter allocator: get next free counter
1433 */
1434static int
1435gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1436{
1437 int idx;
1438
1439 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1440 return idx == x86_pmu.num_events ? -1 : idx;
1441}
1442 963
1443/* 964 cpuc->n_events = n;
1444 * intel-specific counter allocator: check event constraints 965 cpuc->n_added = n - n0;
1445 */
1446static int
1447intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1448{
1449 const struct event_constraint *event_constraint;
1450 int i, code;
1451
1452 if (!event_constraints)
1453 goto skip;
1454
1455 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1456
1457 for_each_event_constraint(event_constraint, event_constraints) {
1458 if (code == event_constraint->code) {
1459 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1460 if (!test_and_set_bit(i, cpuc->used_mask))
1461 return i;
1462 }
1463 return -1;
1464 }
1465 }
1466skip:
1467 return gen_get_event_idx(cpuc, hwc);
1468}
1469
1470static int
1471x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1472{
1473 int idx;
1474
1475 idx = fixed_mode_idx(hwc);
1476 if (idx == X86_PMC_IDX_FIXED_BTS) {
1477 /* BTS is already occupied. */
1478 if (test_and_set_bit(idx, cpuc->used_mask))
1479 return -EAGAIN;
1480
1481 hwc->config_base = 0;
1482 hwc->event_base = 0;
1483 hwc->idx = idx;
1484 } else if (idx >= 0) {
1485 /*
1486 * Try to get the fixed event, if that is already taken
1487 * then try to get a generic event:
1488 */
1489 if (test_and_set_bit(idx, cpuc->used_mask))
1490 goto try_generic;
1491
1492 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1493 /*
1494 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1495 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1496 */
1497 hwc->event_base =
1498 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1499 hwc->idx = idx;
1500 } else {
1501 idx = hwc->idx;
1502 /* Try to get the previous generic event again */
1503 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1504try_generic:
1505 idx = x86_pmu.get_event_idx(cpuc, hwc);
1506 if (idx == -1)
1507 return -EAGAIN;
1508
1509 set_bit(idx, cpuc->used_mask);
1510 hwc->idx = idx;
1511 }
1512 hwc->config_base = x86_pmu.eventsel;
1513 hwc->event_base = x86_pmu.perfctr;
1514 }
1515 966
1516 return idx; 967 return 0;
1517} 968}
1518 969
1519/* 970static int x86_pmu_start(struct perf_event *event)
1520 * Find a PMC slot for the freshly enabled / scheduled in event:
1521 */
1522static int x86_pmu_enable(struct perf_event *event)
1523{ 971{
1524 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1525 struct hw_perf_event *hwc = &event->hw; 972 struct hw_perf_event *hwc = &event->hw;
1526 int idx;
1527 973
1528 idx = x86_schedule_event(cpuc, hwc); 974 if (hwc->idx == -1)
1529 if (idx < 0) 975 return -EAGAIN;
1530 return idx;
1531 976
1532 perf_events_lapic_init(); 977 x86_perf_event_set_period(event, hwc, hwc->idx);
1533 978 x86_pmu.enable(hwc, hwc->idx);
1534 x86_pmu.disable(hwc, idx);
1535
1536 cpuc->events[idx] = event;
1537 set_bit(idx, cpuc->active_mask);
1538
1539 x86_perf_event_set_period(event, hwc, idx);
1540 x86_pmu.enable(hwc, idx);
1541
1542 perf_event_update_userpage(event);
1543 979
1544 return 0; 980 return 0;
1545} 981}
@@ -1583,7 +1019,7 @@ void perf_event_print_debug(void)
1583 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1019 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1584 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1020 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1585 } 1021 }
1586 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); 1022 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1587 1023
1588 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1024 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1589 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1025 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -1607,67 +1043,7 @@ void perf_event_print_debug(void)
1607 local_irq_restore(flags); 1043 local_irq_restore(flags);
1608} 1044}
1609 1045
1610static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) 1046static void x86_pmu_stop(struct perf_event *event)
1611{
1612 struct debug_store *ds = cpuc->ds;
1613 struct bts_record {
1614 u64 from;
1615 u64 to;
1616 u64 flags;
1617 };
1618 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1619 struct bts_record *at, *top;
1620 struct perf_output_handle handle;
1621 struct perf_event_header header;
1622 struct perf_sample_data data;
1623 struct pt_regs regs;
1624
1625 if (!event)
1626 return;
1627
1628 if (!ds)
1629 return;
1630
1631 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1632 top = (struct bts_record *)(unsigned long)ds->bts_index;
1633
1634 if (top <= at)
1635 return;
1636
1637 ds->bts_index = ds->bts_buffer_base;
1638
1639
1640 data.period = event->hw.last_period;
1641 data.addr = 0;
1642 data.raw = NULL;
1643 regs.ip = 0;
1644
1645 /*
1646 * Prepare a generic sample, i.e. fill in the invariant fields.
1647 * We will overwrite the from and to address before we output
1648 * the sample.
1649 */
1650 perf_prepare_sample(&header, &data, event, &regs);
1651
1652 if (perf_output_begin(&handle, event,
1653 header.size * (top - at), 1, 1))
1654 return;
1655
1656 for (; at < top; at++) {
1657 data.ip = at->from;
1658 data.addr = at->to;
1659
1660 perf_output_sample(&handle, &header, &data, event);
1661 }
1662
1663 perf_output_end(&handle);
1664
1665 /* There's new data available. */
1666 event->hw.interrupts++;
1667 event->pending_kill = POLL_IN;
1668}
1669
1670static void x86_pmu_disable(struct perf_event *event)
1671{ 1047{
1672 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1048 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1673 struct hw_perf_event *hwc = &event->hw; 1049 struct hw_perf_event *hwc = &event->hw;
@@ -1681,183 +1057,38 @@ static void x86_pmu_disable(struct perf_event *event)
1681 x86_pmu.disable(hwc, idx); 1057 x86_pmu.disable(hwc, idx);
1682 1058
1683 /* 1059 /*
1684 * Make sure the cleared pointer becomes visible before we
1685 * (potentially) free the event:
1686 */
1687 barrier();
1688
1689 /*
1690 * Drain the remaining delta count out of a event 1060 * Drain the remaining delta count out of a event
1691 * that we are disabling: 1061 * that we are disabling:
1692 */ 1062 */
1693 x86_perf_event_update(event, hwc, idx); 1063 x86_perf_event_update(event, hwc, idx);
1694 1064
1695 /* Drain the remaining BTS records. */
1696 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
1697 intel_pmu_drain_bts_buffer(cpuc);
1698
1699 cpuc->events[idx] = NULL; 1065 cpuc->events[idx] = NULL;
1700 clear_bit(idx, cpuc->used_mask);
1701
1702 perf_event_update_userpage(event);
1703}
1704
1705/*
1706 * Save and restart an expired event. Called by NMI contexts,
1707 * so it has to be careful about preempting normal event ops:
1708 */
1709static int intel_pmu_save_and_restart(struct perf_event *event)
1710{
1711 struct hw_perf_event *hwc = &event->hw;
1712 int idx = hwc->idx;
1713 int ret;
1714
1715 x86_perf_event_update(event, hwc, idx);
1716 ret = x86_perf_event_set_period(event, hwc, idx);
1717
1718 if (event->state == PERF_EVENT_STATE_ACTIVE)
1719 intel_pmu_enable_event(hwc, idx);
1720
1721 return ret;
1722}
1723
1724static void intel_pmu_reset(void)
1725{
1726 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
1727 unsigned long flags;
1728 int idx;
1729
1730 if (!x86_pmu.num_events)
1731 return;
1732
1733 local_irq_save(flags);
1734
1735 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1736
1737 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1738 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1739 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1740 }
1741 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1742 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1743 }
1744 if (ds)
1745 ds->bts_index = ds->bts_buffer_base;
1746
1747 local_irq_restore(flags);
1748}
1749
1750static int p6_pmu_handle_irq(struct pt_regs *regs)
1751{
1752 struct perf_sample_data data;
1753 struct cpu_hw_events *cpuc;
1754 struct perf_event *event;
1755 struct hw_perf_event *hwc;
1756 int idx, handled = 0;
1757 u64 val;
1758
1759 data.addr = 0;
1760 data.raw = NULL;
1761
1762 cpuc = &__get_cpu_var(cpu_hw_events);
1763
1764 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1765 if (!test_bit(idx, cpuc->active_mask))
1766 continue;
1767
1768 event = cpuc->events[idx];
1769 hwc = &event->hw;
1770
1771 val = x86_perf_event_update(event, hwc, idx);
1772 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1773 continue;
1774
1775 /*
1776 * event overflow
1777 */
1778 handled = 1;
1779 data.period = event->hw.last_period;
1780
1781 if (!x86_perf_event_set_period(event, hwc, idx))
1782 continue;
1783
1784 if (perf_event_overflow(event, 1, &data, regs))
1785 p6_pmu_disable_event(hwc, idx);
1786 }
1787
1788 if (handled)
1789 inc_irq_stat(apic_perf_irqs);
1790
1791 return handled;
1792} 1066}
1793 1067
1794/* 1068static void x86_pmu_disable(struct perf_event *event)
1795 * This handler is triggered by the local APIC, so the APIC IRQ handling
1796 * rules apply:
1797 */
1798static int intel_pmu_handle_irq(struct pt_regs *regs)
1799{ 1069{
1800 struct perf_sample_data data; 1070 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1801 struct cpu_hw_events *cpuc; 1071 int i;
1802 int bit, loops;
1803 u64 ack, status;
1804
1805 data.addr = 0;
1806 data.raw = NULL;
1807
1808 cpuc = &__get_cpu_var(cpu_hw_events);
1809
1810 perf_disable();
1811 intel_pmu_drain_bts_buffer(cpuc);
1812 status = intel_pmu_get_status();
1813 if (!status) {
1814 perf_enable();
1815 return 0;
1816 }
1817
1818 loops = 0;
1819again:
1820 if (++loops > 100) {
1821 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
1822 perf_event_print_debug();
1823 intel_pmu_reset();
1824 perf_enable();
1825 return 1;
1826 }
1827 1072
1828 inc_irq_stat(apic_perf_irqs); 1073 x86_pmu_stop(event);
1829 ack = status;
1830 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1831 struct perf_event *event = cpuc->events[bit];
1832 1074
1833 clear_bit(bit, (unsigned long *) &status); 1075 for (i = 0; i < cpuc->n_events; i++) {
1834 if (!test_bit(bit, cpuc->active_mask)) 1076 if (event == cpuc->event_list[i]) {
1835 continue;
1836 1077
1837 if (!intel_pmu_save_and_restart(event)) 1078 if (x86_pmu.put_event_constraints)
1838 continue; 1079 x86_pmu.put_event_constraints(cpuc, event);
1839 1080
1840 data.period = event->hw.last_period; 1081 while (++i < cpuc->n_events)
1082 cpuc->event_list[i-1] = cpuc->event_list[i];
1841 1083
1842 if (perf_event_overflow(event, 1, &data, regs)) 1084 --cpuc->n_events;
1843 intel_pmu_disable_event(&event->hw, bit); 1085 break;
1086 }
1844 } 1087 }
1845 1088 perf_event_update_userpage(event);
1846 intel_pmu_ack_status(ack);
1847
1848 /*
1849 * Repeat if there is more work to be done:
1850 */
1851 status = intel_pmu_get_status();
1852 if (status)
1853 goto again;
1854
1855 perf_enable();
1856
1857 return 1;
1858} 1089}
1859 1090
1860static int amd_pmu_handle_irq(struct pt_regs *regs) 1091static int x86_pmu_handle_irq(struct pt_regs *regs)
1861{ 1092{
1862 struct perf_sample_data data; 1093 struct perf_sample_data data;
1863 struct cpu_hw_events *cpuc; 1094 struct cpu_hw_events *cpuc;
@@ -1866,8 +1097,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1866 int idx, handled = 0; 1097 int idx, handled = 0;
1867 u64 val; 1098 u64 val;
1868 1099
1869 data.addr = 0; 1100 perf_sample_data_init(&data, 0);
1870 data.raw = NULL;
1871 1101
1872 cpuc = &__get_cpu_var(cpu_hw_events); 1102 cpuc = &__get_cpu_var(cpu_hw_events);
1873 1103
@@ -1892,7 +1122,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1892 continue; 1122 continue;
1893 1123
1894 if (perf_event_overflow(event, 1, &data, regs)) 1124 if (perf_event_overflow(event, 1, &data, regs))
1895 amd_pmu_disable_event(hwc, idx); 1125 x86_pmu.disable(hwc, idx);
1896 } 1126 }
1897 1127
1898 if (handled) 1128 if (handled)
@@ -1975,194 +1205,137 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1975 .priority = 1 1205 .priority = 1
1976}; 1206};
1977 1207
1978static __initconst struct x86_pmu p6_pmu = { 1208static struct event_constraint unconstrained;
1979 .name = "p6", 1209static struct event_constraint emptyconstraint;
1980 .handle_irq = p6_pmu_handle_irq,
1981 .disable_all = p6_pmu_disable_all,
1982 .enable_all = p6_pmu_enable_all,
1983 .enable = p6_pmu_enable_event,
1984 .disable = p6_pmu_disable_event,
1985 .eventsel = MSR_P6_EVNTSEL0,
1986 .perfctr = MSR_P6_PERFCTR0,
1987 .event_map = p6_pmu_event_map,
1988 .raw_event = p6_pmu_raw_event,
1989 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1990 .apic = 1,
1991 .max_period = (1ULL << 31) - 1,
1992 .version = 0,
1993 .num_events = 2,
1994 /*
1995 * Events have 40 bits implemented. However they are designed such
1996 * that bits [32-39] are sign extensions of bit 31. As such the
1997 * effective width of a event for P6-like PMU is 32 bits only.
1998 *
1999 * See IA-32 Intel Architecture Software developer manual Vol 3B
2000 */
2001 .event_bits = 32,
2002 .event_mask = (1ULL << 32) - 1,
2003 .get_event_idx = intel_get_event_idx,
2004};
2005 1210
2006static __initconst struct x86_pmu intel_pmu = { 1211static struct event_constraint *
2007 .name = "Intel", 1212x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
2008 .handle_irq = intel_pmu_handle_irq, 1213{
2009 .disable_all = intel_pmu_disable_all, 1214 struct event_constraint *c;
2010 .enable_all = intel_pmu_enable_all,
2011 .enable = intel_pmu_enable_event,
2012 .disable = intel_pmu_disable_event,
2013 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
2014 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
2015 .event_map = intel_pmu_event_map,
2016 .raw_event = intel_pmu_raw_event,
2017 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
2018 .apic = 1,
2019 /*
2020 * Intel PMCs cannot be accessed sanely above 32 bit width,
2021 * so we install an artificial 1<<31 period regardless of
2022 * the generic event period:
2023 */
2024 .max_period = (1ULL << 31) - 1,
2025 .enable_bts = intel_pmu_enable_bts,
2026 .disable_bts = intel_pmu_disable_bts,
2027 .get_event_idx = intel_get_event_idx,
2028};
2029 1215
2030static __initconst struct x86_pmu amd_pmu = { 1216 if (x86_pmu.event_constraints) {
2031 .name = "AMD", 1217 for_each_event_constraint(c, x86_pmu.event_constraints) {
2032 .handle_irq = amd_pmu_handle_irq, 1218 if ((event->hw.config & c->cmask) == c->code)
2033 .disable_all = amd_pmu_disable_all, 1219 return c;
2034 .enable_all = amd_pmu_enable_all, 1220 }
2035 .enable = amd_pmu_enable_event, 1221 }
2036 .disable = amd_pmu_disable_event,
2037 .eventsel = MSR_K7_EVNTSEL0,
2038 .perfctr = MSR_K7_PERFCTR0,
2039 .event_map = amd_pmu_event_map,
2040 .raw_event = amd_pmu_raw_event,
2041 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
2042 .num_events = 4,
2043 .event_bits = 48,
2044 .event_mask = (1ULL << 48) - 1,
2045 .apic = 1,
2046 /* use highest bit to detect overflow */
2047 .max_period = (1ULL << 47) - 1,
2048 .get_event_idx = gen_get_event_idx,
2049};
2050 1222
2051static __init int p6_pmu_init(void) 1223 return &unconstrained;
1224}
1225
1226static int x86_event_sched_in(struct perf_event *event,
1227 struct perf_cpu_context *cpuctx)
2052{ 1228{
2053 switch (boot_cpu_data.x86_model) { 1229 int ret = 0;
2054 case 1:
2055 case 3: /* Pentium Pro */
2056 case 5:
2057 case 6: /* Pentium II */
2058 case 7:
2059 case 8:
2060 case 11: /* Pentium III */
2061 event_constraints = intel_p6_event_constraints;
2062 break;
2063 case 9:
2064 case 13:
2065 /* Pentium M */
2066 event_constraints = intel_p6_event_constraints;
2067 break;
2068 default:
2069 pr_cont("unsupported p6 CPU model %d ",
2070 boot_cpu_data.x86_model);
2071 return -ENODEV;
2072 }
2073 1230
2074 x86_pmu = p6_pmu; 1231 event->state = PERF_EVENT_STATE_ACTIVE;
1232 event->oncpu = smp_processor_id();
1233 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
2075 1234
2076 return 0; 1235 if (!is_x86_event(event))
1236 ret = event->pmu->enable(event);
1237
1238 if (!ret && !is_software_event(event))
1239 cpuctx->active_oncpu++;
1240
1241 if (!ret && event->attr.exclusive)
1242 cpuctx->exclusive = 1;
1243
1244 return ret;
2077} 1245}
2078 1246
2079static __init int intel_pmu_init(void) 1247static void x86_event_sched_out(struct perf_event *event,
1248 struct perf_cpu_context *cpuctx)
2080{ 1249{
2081 union cpuid10_edx edx; 1250 event->state = PERF_EVENT_STATE_INACTIVE;
2082 union cpuid10_eax eax; 1251 event->oncpu = -1;
2083 unsigned int unused;
2084 unsigned int ebx;
2085 int version;
2086
2087 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
2088 /* check for P6 processor family */
2089 if (boot_cpu_data.x86 == 6) {
2090 return p6_pmu_init();
2091 } else {
2092 return -ENODEV;
2093 }
2094 }
2095 1252
2096 /* 1253 if (!is_x86_event(event))
2097 * Check whether the Architectural PerfMon supports 1254 event->pmu->disable(event);
2098 * Branch Misses Retired hw_event or not.
2099 */
2100 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
2101 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
2102 return -ENODEV;
2103 1255
2104 version = eax.split.version_id; 1256 event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
2105 if (version < 2)
2106 return -ENODEV;
2107 1257
2108 x86_pmu = intel_pmu; 1258 if (!is_software_event(event))
2109 x86_pmu.version = version; 1259 cpuctx->active_oncpu--;
2110 x86_pmu.num_events = eax.split.num_events;
2111 x86_pmu.event_bits = eax.split.bit_width;
2112 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
2113 1260
2114 /* 1261 if (event->attr.exclusive || !cpuctx->active_oncpu)
2115 * Quirk: v2 perfmon does not report fixed-purpose events, so 1262 cpuctx->exclusive = 0;
2116 * assume at least 3 events: 1263}
2117 */
2118 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
2119 1264
1265/*
1266 * Called to enable a whole group of events.
1267 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
1268 * Assumes the caller has disabled interrupts and has
1269 * frozen the PMU with hw_perf_save_disable.
1270 *
1271 * called with PMU disabled. If successful and return value 1,
1272 * then guaranteed to call perf_enable() and hw_perf_enable()
1273 */
1274int hw_perf_group_sched_in(struct perf_event *leader,
1275 struct perf_cpu_context *cpuctx,
1276 struct perf_event_context *ctx)
1277{
1278 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1279 struct perf_event *sub;
1280 int assign[X86_PMC_IDX_MAX];
1281 int n0, n1, ret;
1282
1283 /* n0 = total number of events */
1284 n0 = collect_events(cpuc, leader, true);
1285 if (n0 < 0)
1286 return n0;
1287
1288 ret = x86_schedule_events(cpuc, n0, assign);
1289 if (ret)
1290 return ret;
1291
1292 ret = x86_event_sched_in(leader, cpuctx);
1293 if (ret)
1294 return ret;
1295
1296 n1 = 1;
1297 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1298 if (sub->state > PERF_EVENT_STATE_OFF) {
1299 ret = x86_event_sched_in(sub, cpuctx);
1300 if (ret)
1301 goto undo;
1302 ++n1;
1303 }
1304 }
2120 /* 1305 /*
2121 * Install the hw-cache-events table: 1306 * copy new assignment, now we know it is possible
1307 * will be used by hw_perf_enable()
2122 */ 1308 */
2123 switch (boot_cpu_data.x86_model) { 1309 memcpy(cpuc->assign, assign, n0*sizeof(int));
2124 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
2125 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
2126 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
2127 case 29: /* six-core 45 nm xeon "Dunnington" */
2128 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
2129 sizeof(hw_cache_event_ids));
2130
2131 pr_cont("Core2 events, ");
2132 event_constraints = intel_core_event_constraints;
2133 break;
2134 default:
2135 case 26:
2136 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2137 sizeof(hw_cache_event_ids));
2138 1310
2139 event_constraints = intel_nehalem_event_constraints; 1311 cpuc->n_events = n0;
2140 pr_cont("Nehalem/Corei7 events, "); 1312 cpuc->n_added = n1;
2141 break; 1313 ctx->nr_active += n1;
2142 case 28:
2143 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2144 sizeof(hw_cache_event_ids));
2145 1314
2146 pr_cont("Atom events, "); 1315 /*
2147 break; 1316 * 1 means successful and events are active
1317 * This is not quite true because we defer
1318 * actual activation until hw_perf_enable() but
1319 * this way we* ensure caller won't try to enable
1320 * individual events
1321 */
1322 return 1;
1323undo:
1324 x86_event_sched_out(leader, cpuctx);
1325 n0 = 1;
1326 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1327 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
1328 x86_event_sched_out(sub, cpuctx);
1329 if (++n0 == n1)
1330 break;
1331 }
2148 } 1332 }
2149 return 0; 1333 return ret;
2150} 1334}
2151 1335
2152static __init int amd_pmu_init(void) 1336#include "perf_event_amd.c"
2153{ 1337#include "perf_event_p6.c"
2154 /* Performance-monitoring supported from K7 and later: */ 1338#include "perf_event_intel.c"
2155 if (boot_cpu_data.x86 < 6)
2156 return -ENODEV;
2157
2158 x86_pmu = amd_pmu;
2159
2160 /* Events are common for all AMDs */
2161 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
2162 sizeof(hw_cache_event_ids));
2163
2164 return 0;
2165}
2166 1339
2167static void __init pmu_check_apic(void) 1340static void __init pmu_check_apic(void)
2168{ 1341{
@@ -2176,6 +1349,7 @@ static void __init pmu_check_apic(void)
2176 1349
2177void __init init_hw_perf_events(void) 1350void __init init_hw_perf_events(void)
2178{ 1351{
1352 struct event_constraint *c;
2179 int err; 1353 int err;
2180 1354
2181 pr_info("Performance Events: "); 1355 pr_info("Performance Events: ");
@@ -2220,6 +1394,20 @@ void __init init_hw_perf_events(void)
2220 perf_events_lapic_init(); 1394 perf_events_lapic_init();
2221 register_die_notifier(&perf_event_nmi_notifier); 1395 register_die_notifier(&perf_event_nmi_notifier);
2222 1396
1397 unconstrained = (struct event_constraint)
1398 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
1399 0, x86_pmu.num_events);
1400
1401 if (x86_pmu.event_constraints) {
1402 for_each_event_constraint(c, x86_pmu.event_constraints) {
1403 if (c->cmask != INTEL_ARCH_FIXED_MASK)
1404 continue;
1405
1406 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
1407 c->weight += x86_pmu.num_events;
1408 }
1409 }
1410
2223 pr_info("... version: %d\n", x86_pmu.version); 1411 pr_info("... version: %d\n", x86_pmu.version);
2224 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1412 pr_info("... bit width: %d\n", x86_pmu.event_bits);
2225 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1413 pr_info("... generic registers: %d\n", x86_pmu.num_events);
@@ -2237,50 +1425,79 @@ static inline void x86_pmu_read(struct perf_event *event)
2237static const struct pmu pmu = { 1425static const struct pmu pmu = {
2238 .enable = x86_pmu_enable, 1426 .enable = x86_pmu_enable,
2239 .disable = x86_pmu_disable, 1427 .disable = x86_pmu_disable,
1428 .start = x86_pmu_start,
1429 .stop = x86_pmu_stop,
2240 .read = x86_pmu_read, 1430 .read = x86_pmu_read,
2241 .unthrottle = x86_pmu_unthrottle, 1431 .unthrottle = x86_pmu_unthrottle,
2242}; 1432};
2243 1433
2244static int 1434/*
2245validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) 1435 * validate a single event group
2246{ 1436 *
2247 struct hw_perf_event fake_event = event->hw; 1437 * validation include:
2248 1438 * - check events are compatible which each other
2249 if (event->pmu && event->pmu != &pmu) 1439 * - events do not compete for the same counter
2250 return 0; 1440 * - number of events <= number of counters
2251 1441 *
2252 return x86_schedule_event(cpuc, &fake_event) >= 0; 1442 * validation ensures the group can be loaded onto the
2253} 1443 * PMU if it was the only group available.
2254 1444 */
2255static int validate_group(struct perf_event *event) 1445static int validate_group(struct perf_event *event)
2256{ 1446{
2257 struct perf_event *sibling, *leader = event->group_leader; 1447 struct perf_event *leader = event->group_leader;
2258 struct cpu_hw_events fake_pmu; 1448 struct cpu_hw_events *fake_cpuc;
1449 int ret, n;
2259 1450
2260 memset(&fake_pmu, 0, sizeof(fake_pmu)); 1451 ret = -ENOMEM;
1452 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1453 if (!fake_cpuc)
1454 goto out;
2261 1455
2262 if (!validate_event(&fake_pmu, leader)) 1456 /*
2263 return -ENOSPC; 1457 * the event is not yet connected with its
1458 * siblings therefore we must first collect
1459 * existing siblings, then add the new event
1460 * before we can simulate the scheduling
1461 */
1462 ret = -ENOSPC;
1463 n = collect_events(fake_cpuc, leader, true);
1464 if (n < 0)
1465 goto out_free;
2264 1466
2265 list_for_each_entry(sibling, &leader->sibling_list, group_entry) { 1467 fake_cpuc->n_events = n;
2266 if (!validate_event(&fake_pmu, sibling)) 1468 n = collect_events(fake_cpuc, event, false);
2267 return -ENOSPC; 1469 if (n < 0)
2268 } 1470 goto out_free;
2269 1471
2270 if (!validate_event(&fake_pmu, event)) 1472 fake_cpuc->n_events = n;
2271 return -ENOSPC;
2272 1473
2273 return 0; 1474 ret = x86_schedule_events(fake_cpuc, n, NULL);
1475
1476out_free:
1477 kfree(fake_cpuc);
1478out:
1479 return ret;
2274} 1480}
2275 1481
2276const struct pmu *hw_perf_event_init(struct perf_event *event) 1482const struct pmu *hw_perf_event_init(struct perf_event *event)
2277{ 1483{
1484 const struct pmu *tmp;
2278 int err; 1485 int err;
2279 1486
2280 err = __hw_perf_event_init(event); 1487 err = __hw_perf_event_init(event);
2281 if (!err) { 1488 if (!err) {
1489 /*
1490 * we temporarily connect event to its pmu
1491 * such that validate_group() can classify
1492 * it as an x86 event using is_x86_event()
1493 */
1494 tmp = event->pmu;
1495 event->pmu = &pmu;
1496
2282 if (event->group_leader != event) 1497 if (event->group_leader != event)
2283 err = validate_group(event); 1498 err = validate_group(event);
1499
1500 event->pmu = tmp;
2284 } 1501 }
2285 if (err) { 1502 if (err) {
2286 if (event->destroy) 1503 if (event->destroy)
@@ -2304,7 +1521,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2304 1521
2305static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1522static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2306static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 1523static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2307static DEFINE_PER_CPU(int, in_ignored_frame);
2308 1524
2309 1525
2310static void 1526static void
@@ -2320,10 +1536,6 @@ static void backtrace_warning(void *data, char *msg)
2320 1536
2321static int backtrace_stack(void *data, char *name) 1537static int backtrace_stack(void *data, char *name)
2322{ 1538{
2323 per_cpu(in_ignored_frame, smp_processor_id()) =
2324 x86_is_stack_id(NMI_STACK, name) ||
2325 x86_is_stack_id(DEBUG_STACK, name);
2326
2327 return 0; 1539 return 0;
2328} 1540}
2329 1541
@@ -2331,9 +1543,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2331{ 1543{
2332 struct perf_callchain_entry *entry = data; 1544 struct perf_callchain_entry *entry = data;
2333 1545
2334 if (per_cpu(in_ignored_frame, smp_processor_id()))
2335 return;
2336
2337 if (reliable) 1546 if (reliable)
2338 callchain_store(entry, addr); 1547 callchain_store(entry, addr);
2339} 1548}
@@ -2440,9 +1649,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
2440 1649
2441 is_user = user_mode(regs); 1650 is_user = user_mode(regs);
2442 1651
2443 if (!current || current->pid == 0)
2444 return;
2445
2446 if (is_user && current->state != TASK_RUNNING) 1652 if (is_user && current->state != TASK_RUNNING)
2447 return; 1653 return;
2448 1654
@@ -2472,4 +1678,25 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2472void hw_perf_event_setup_online(int cpu) 1678void hw_perf_event_setup_online(int cpu)
2473{ 1679{
2474 init_debug_store_on_cpu(cpu); 1680 init_debug_store_on_cpu(cpu);
1681
1682 switch (boot_cpu_data.x86_vendor) {
1683 case X86_VENDOR_AMD:
1684 amd_pmu_cpu_online(cpu);
1685 break;
1686 default:
1687 return;
1688 }
1689}
1690
1691void hw_perf_event_setup_offline(int cpu)
1692{
1693 init_debug_store_on_cpu(cpu);
1694
1695 switch (boot_cpu_data.x86_vendor) {
1696 case X86_VENDOR_AMD:
1697 amd_pmu_cpu_offline(cpu);
1698 break;
1699 default:
1700 return;
1701 }
2475} 1702}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
new file mode 100644
index 000000000000..8f3dbfda3c4f
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -0,0 +1,416 @@
1#ifdef CONFIG_CPU_SUP_AMD
2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
9{
10 [ C(L1D) ] = {
11 [ C(OP_READ) ] = {
12 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
13 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
14 },
15 [ C(OP_WRITE) ] = {
16 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
17 [ C(RESULT_MISS) ] = 0,
18 },
19 [ C(OP_PREFETCH) ] = {
20 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
21 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
22 },
23 },
24 [ C(L1I ) ] = {
25 [ C(OP_READ) ] = {
26 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
27 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
28 },
29 [ C(OP_WRITE) ] = {
30 [ C(RESULT_ACCESS) ] = -1,
31 [ C(RESULT_MISS) ] = -1,
32 },
33 [ C(OP_PREFETCH) ] = {
34 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
35 [ C(RESULT_MISS) ] = 0,
36 },
37 },
38 [ C(LL ) ] = {
39 [ C(OP_READ) ] = {
40 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
41 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
42 },
43 [ C(OP_WRITE) ] = {
44 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
45 [ C(RESULT_MISS) ] = 0,
46 },
47 [ C(OP_PREFETCH) ] = {
48 [ C(RESULT_ACCESS) ] = 0,
49 [ C(RESULT_MISS) ] = 0,
50 },
51 },
52 [ C(DTLB) ] = {
53 [ C(OP_READ) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
55 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
56 },
57 [ C(OP_WRITE) ] = {
58 [ C(RESULT_ACCESS) ] = 0,
59 [ C(RESULT_MISS) ] = 0,
60 },
61 [ C(OP_PREFETCH) ] = {
62 [ C(RESULT_ACCESS) ] = 0,
63 [ C(RESULT_MISS) ] = 0,
64 },
65 },
66 [ C(ITLB) ] = {
67 [ C(OP_READ) ] = {
68 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
69 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
70 },
71 [ C(OP_WRITE) ] = {
72 [ C(RESULT_ACCESS) ] = -1,
73 [ C(RESULT_MISS) ] = -1,
74 },
75 [ C(OP_PREFETCH) ] = {
76 [ C(RESULT_ACCESS) ] = -1,
77 [ C(RESULT_MISS) ] = -1,
78 },
79 },
80 [ C(BPU ) ] = {
81 [ C(OP_READ) ] = {
82 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
83 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
84 },
85 [ C(OP_WRITE) ] = {
86 [ C(RESULT_ACCESS) ] = -1,
87 [ C(RESULT_MISS) ] = -1,
88 },
89 [ C(OP_PREFETCH) ] = {
90 [ C(RESULT_ACCESS) ] = -1,
91 [ C(RESULT_MISS) ] = -1,
92 },
93 },
94};
95
96/*
97 * AMD Performance Monitor K7 and later.
98 */
99static const u64 amd_perfmon_event_map[] =
100{
101 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
107};
108
109static u64 amd_pmu_event_map(int hw_event)
110{
111 return amd_perfmon_event_map[hw_event];
112}
113
114static u64 amd_pmu_raw_event(u64 hw_event)
115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
121
122#define K7_EVNTSEL_MASK \
123 (K7_EVNTSEL_EVENT_MASK | \
124 K7_EVNTSEL_UNIT_MASK | \
125 K7_EVNTSEL_EDGE_MASK | \
126 K7_EVNTSEL_INV_MASK | \
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130}
131
132/*
133 * AMD64 events are detected based on their event codes.
134 */
135static inline int amd_is_nb_event(struct hw_perf_event *hwc)
136{
137 return (hwc->config & 0xe0) == 0xe0;
138}
139
140static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
141 struct perf_event *event)
142{
143 struct hw_perf_event *hwc = &event->hw;
144 struct amd_nb *nb = cpuc->amd_nb;
145 int i;
146
147 /*
148 * only care about NB events
149 */
150 if (!(nb && amd_is_nb_event(hwc)))
151 return;
152
153 /*
154 * need to scan whole list because event may not have
155 * been assigned during scheduling
156 *
157 * no race condition possible because event can only
158 * be removed on one CPU at a time AND PMU is disabled
159 * when we come here
160 */
161 for (i = 0; i < x86_pmu.num_events; i++) {
162 if (nb->owners[i] == event) {
163 cmpxchg(nb->owners+i, event, NULL);
164 break;
165 }
166 }
167}
168
169 /*
170 * AMD64 NorthBridge events need special treatment because
171 * counter access needs to be synchronized across all cores
172 * of a package. Refer to BKDG section 3.12
173 *
174 * NB events are events measuring L3 cache, Hypertransport
175 * traffic. They are identified by an event code >= 0xe00.
176 * They measure events on the NorthBride which is shared
177 * by all cores on a package. NB events are counted on a
178 * shared set of counters. When a NB event is programmed
179 * in a counter, the data actually comes from a shared
180 * counter. Thus, access to those counters needs to be
181 * synchronized.
182 *
183 * We implement the synchronization such that no two cores
184 * can be measuring NB events using the same counters. Thus,
185 * we maintain a per-NB allocation table. The available slot
186 * is propagated using the event_constraint structure.
187 *
188 * We provide only one choice for each NB event based on
189 * the fact that only NB events have restrictions. Consequently,
190 * if a counter is available, there is a guarantee the NB event
191 * will be assigned to it. If no slot is available, an empty
192 * constraint is returned and scheduling will eventually fail
193 * for this event.
194 *
195 * Note that all cores attached the same NB compete for the same
196 * counters to host NB events, this is why we use atomic ops. Some
197 * multi-chip CPUs may have more than one NB.
198 *
199 * Given that resources are allocated (cmpxchg), they must be
200 * eventually freed for others to use. This is accomplished by
201 * calling amd_put_event_constraints().
202 *
203 * Non NB events are not impacted by this restriction.
204 */
205static struct event_constraint *
206amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
207{
208 struct hw_perf_event *hwc = &event->hw;
209 struct amd_nb *nb = cpuc->amd_nb;
210 struct perf_event *old = NULL;
211 int max = x86_pmu.num_events;
212 int i, j, k = -1;
213
214 /*
215 * if not NB event or no NB, then no constraints
216 */
217 if (!(nb && amd_is_nb_event(hwc)))
218 return &unconstrained;
219
220 /*
221 * detect if already present, if so reuse
222 *
223 * cannot merge with actual allocation
224 * because of possible holes
225 *
226 * event can already be present yet not assigned (in hwc->idx)
227 * because of successive calls to x86_schedule_events() from
228 * hw_perf_group_sched_in() without hw_perf_enable()
229 */
230 for (i = 0; i < max; i++) {
231 /*
232 * keep track of first free slot
233 */
234 if (k == -1 && !nb->owners[i])
235 k = i;
236
237 /* already present, reuse */
238 if (nb->owners[i] == event)
239 goto done;
240 }
241 /*
242 * not present, so grab a new slot
243 * starting either at:
244 */
245 if (hwc->idx != -1) {
246 /* previous assignment */
247 i = hwc->idx;
248 } else if (k != -1) {
249 /* start from free slot found */
250 i = k;
251 } else {
252 /*
253 * event not found, no slot found in
254 * first pass, try again from the
255 * beginning
256 */
257 i = 0;
258 }
259 j = i;
260 do {
261 old = cmpxchg(nb->owners+i, NULL, event);
262 if (!old)
263 break;
264 if (++i == max)
265 i = 0;
266 } while (i != j);
267done:
268 if (!old)
269 return &nb->event_constraints[i];
270
271 return &emptyconstraint;
272}
273
274static __initconst struct x86_pmu amd_pmu = {
275 .name = "AMD",
276 .handle_irq = x86_pmu_handle_irq,
277 .disable_all = x86_pmu_disable_all,
278 .enable_all = x86_pmu_enable_all,
279 .enable = x86_pmu_enable_event,
280 .disable = x86_pmu_disable_event,
281 .eventsel = MSR_K7_EVNTSEL0,
282 .perfctr = MSR_K7_PERFCTR0,
283 .event_map = amd_pmu_event_map,
284 .raw_event = amd_pmu_raw_event,
285 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
286 .num_events = 4,
287 .event_bits = 48,
288 .event_mask = (1ULL << 48) - 1,
289 .apic = 1,
290 /* use highest bit to detect overflow */
291 .max_period = (1ULL << 47) - 1,
292 .get_event_constraints = amd_get_event_constraints,
293 .put_event_constraints = amd_put_event_constraints
294};
295
296static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
297{
298 struct amd_nb *nb;
299 int i;
300
301 nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
302 if (!nb)
303 return NULL;
304
305 memset(nb, 0, sizeof(*nb));
306 nb->nb_id = nb_id;
307
308 /*
309 * initialize all possible NB constraints
310 */
311 for (i = 0; i < x86_pmu.num_events; i++) {
312 set_bit(i, nb->event_constraints[i].idxmsk);
313 nb->event_constraints[i].weight = 1;
314 }
315 return nb;
316}
317
318static void amd_pmu_cpu_online(int cpu)
319{
320 struct cpu_hw_events *cpu1, *cpu2;
321 struct amd_nb *nb = NULL;
322 int i, nb_id;
323
324 if (boot_cpu_data.x86_max_cores < 2)
325 return;
326
327 /*
328 * function may be called too early in the
329 * boot process, in which case nb_id is bogus
330 */
331 nb_id = amd_get_nb_id(cpu);
332 if (nb_id == BAD_APICID)
333 return;
334
335 cpu1 = &per_cpu(cpu_hw_events, cpu);
336 cpu1->amd_nb = NULL;
337
338 raw_spin_lock(&amd_nb_lock);
339
340 for_each_online_cpu(i) {
341 cpu2 = &per_cpu(cpu_hw_events, i);
342 nb = cpu2->amd_nb;
343 if (!nb)
344 continue;
345 if (nb->nb_id == nb_id)
346 goto found;
347 }
348
349 nb = amd_alloc_nb(cpu, nb_id);
350 if (!nb) {
351 pr_err("perf_events: failed NB allocation for CPU%d\n", cpu);
352 raw_spin_unlock(&amd_nb_lock);
353 return;
354 }
355found:
356 nb->refcnt++;
357 cpu1->amd_nb = nb;
358
359 raw_spin_unlock(&amd_nb_lock);
360}
361
362static void amd_pmu_cpu_offline(int cpu)
363{
364 struct cpu_hw_events *cpuhw;
365
366 if (boot_cpu_data.x86_max_cores < 2)
367 return;
368
369 cpuhw = &per_cpu(cpu_hw_events, cpu);
370
371 raw_spin_lock(&amd_nb_lock);
372
373 if (--cpuhw->amd_nb->refcnt == 0)
374 kfree(cpuhw->amd_nb);
375
376 cpuhw->amd_nb = NULL;
377
378 raw_spin_unlock(&amd_nb_lock);
379}
380
381static __init int amd_pmu_init(void)
382{
383 /* Performance-monitoring supported from K7 and later: */
384 if (boot_cpu_data.x86 < 6)
385 return -ENODEV;
386
387 x86_pmu = amd_pmu;
388
389 /* Events are common for all AMDs */
390 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
391 sizeof(hw_cache_event_ids));
392
393 /*
394 * explicitly initialize the boot cpu, other cpus will get
395 * the cpu hotplug callbacks from smp_init()
396 */
397 amd_pmu_cpu_online(smp_processor_id());
398 return 0;
399}
400
401#else /* CONFIG_CPU_SUP_AMD */
402
403static int amd_pmu_init(void)
404{
405 return 0;
406}
407
408static void amd_pmu_cpu_online(int cpu)
409{
410}
411
412static void amd_pmu_cpu_offline(int cpu)
413{
414}
415
416#endif
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
new file mode 100644
index 000000000000..44b60c852107
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -0,0 +1,980 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Intel PerfMon, used on Core and later.
5 */
6static const u64 intel_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
15};
16
17static struct event_constraint intel_core_event_constraints[] =
18{
19 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
20 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
21 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
22 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
23 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
24 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
25 EVENT_CONSTRAINT_END
26};
27
28static struct event_constraint intel_core2_event_constraints[] =
29{
30 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
31 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
32 /*
33 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
34 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
35 * ratio between these counters.
36 */
37 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
38 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
39 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
40 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
41 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
42 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
43 INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
44 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
45 INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
46 INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
47 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
48 EVENT_CONSTRAINT_END
49};
50
51static struct event_constraint intel_nehalem_event_constraints[] =
52{
53 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
54 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
55 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
56 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
57 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
58 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
59 INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
60 INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
61 INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
62 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
63 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
64 EVENT_CONSTRAINT_END
65};
66
67static struct event_constraint intel_westmere_event_constraints[] =
68{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
71 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
72 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
73 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
74 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
75 EVENT_CONSTRAINT_END
76};
77
78static struct event_constraint intel_gen_event_constraints[] =
79{
80 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
81 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
82 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
83 EVENT_CONSTRAINT_END
84};
85
86static u64 intel_pmu_event_map(int hw_event)
87{
88 return intel_perfmon_event_map[hw_event];
89}
90
91static __initconst u64 westmere_hw_cache_event_ids
92 [PERF_COUNT_HW_CACHE_MAX]
93 [PERF_COUNT_HW_CACHE_OP_MAX]
94 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
95{
96 [ C(L1D) ] = {
97 [ C(OP_READ) ] = {
98 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
99 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
100 },
101 [ C(OP_WRITE) ] = {
102 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
103 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
104 },
105 [ C(OP_PREFETCH) ] = {
106 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
107 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
108 },
109 },
110 [ C(L1I ) ] = {
111 [ C(OP_READ) ] = {
112 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
113 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
114 },
115 [ C(OP_WRITE) ] = {
116 [ C(RESULT_ACCESS) ] = -1,
117 [ C(RESULT_MISS) ] = -1,
118 },
119 [ C(OP_PREFETCH) ] = {
120 [ C(RESULT_ACCESS) ] = 0x0,
121 [ C(RESULT_MISS) ] = 0x0,
122 },
123 },
124 [ C(LL ) ] = {
125 [ C(OP_READ) ] = {
126 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
127 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
128 },
129 [ C(OP_WRITE) ] = {
130 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
131 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
132 },
133 [ C(OP_PREFETCH) ] = {
134 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
135 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
136 },
137 },
138 [ C(DTLB) ] = {
139 [ C(OP_READ) ] = {
140 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
141 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
142 },
143 [ C(OP_WRITE) ] = {
144 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
145 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
146 },
147 [ C(OP_PREFETCH) ] = {
148 [ C(RESULT_ACCESS) ] = 0x0,
149 [ C(RESULT_MISS) ] = 0x0,
150 },
151 },
152 [ C(ITLB) ] = {
153 [ C(OP_READ) ] = {
154 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
155 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
156 },
157 [ C(OP_WRITE) ] = {
158 [ C(RESULT_ACCESS) ] = -1,
159 [ C(RESULT_MISS) ] = -1,
160 },
161 [ C(OP_PREFETCH) ] = {
162 [ C(RESULT_ACCESS) ] = -1,
163 [ C(RESULT_MISS) ] = -1,
164 },
165 },
166 [ C(BPU ) ] = {
167 [ C(OP_READ) ] = {
168 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
169 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
170 },
171 [ C(OP_WRITE) ] = {
172 [ C(RESULT_ACCESS) ] = -1,
173 [ C(RESULT_MISS) ] = -1,
174 },
175 [ C(OP_PREFETCH) ] = {
176 [ C(RESULT_ACCESS) ] = -1,
177 [ C(RESULT_MISS) ] = -1,
178 },
179 },
180};
181
182static __initconst u64 nehalem_hw_cache_event_ids
183 [PERF_COUNT_HW_CACHE_MAX]
184 [PERF_COUNT_HW_CACHE_OP_MAX]
185 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
186{
187 [ C(L1D) ] = {
188 [ C(OP_READ) ] = {
189 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
190 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
191 },
192 [ C(OP_WRITE) ] = {
193 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
194 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
195 },
196 [ C(OP_PREFETCH) ] = {
197 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
198 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
199 },
200 },
201 [ C(L1I ) ] = {
202 [ C(OP_READ) ] = {
203 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
204 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
205 },
206 [ C(OP_WRITE) ] = {
207 [ C(RESULT_ACCESS) ] = -1,
208 [ C(RESULT_MISS) ] = -1,
209 },
210 [ C(OP_PREFETCH) ] = {
211 [ C(RESULT_ACCESS) ] = 0x0,
212 [ C(RESULT_MISS) ] = 0x0,
213 },
214 },
215 [ C(LL ) ] = {
216 [ C(OP_READ) ] = {
217 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
218 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
219 },
220 [ C(OP_WRITE) ] = {
221 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
222 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
223 },
224 [ C(OP_PREFETCH) ] = {
225 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
226 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
227 },
228 },
229 [ C(DTLB) ] = {
230 [ C(OP_READ) ] = {
231 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
232 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
233 },
234 [ C(OP_WRITE) ] = {
235 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
236 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
237 },
238 [ C(OP_PREFETCH) ] = {
239 [ C(RESULT_ACCESS) ] = 0x0,
240 [ C(RESULT_MISS) ] = 0x0,
241 },
242 },
243 [ C(ITLB) ] = {
244 [ C(OP_READ) ] = {
245 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
246 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
247 },
248 [ C(OP_WRITE) ] = {
249 [ C(RESULT_ACCESS) ] = -1,
250 [ C(RESULT_MISS) ] = -1,
251 },
252 [ C(OP_PREFETCH) ] = {
253 [ C(RESULT_ACCESS) ] = -1,
254 [ C(RESULT_MISS) ] = -1,
255 },
256 },
257 [ C(BPU ) ] = {
258 [ C(OP_READ) ] = {
259 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
260 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
261 },
262 [ C(OP_WRITE) ] = {
263 [ C(RESULT_ACCESS) ] = -1,
264 [ C(RESULT_MISS) ] = -1,
265 },
266 [ C(OP_PREFETCH) ] = {
267 [ C(RESULT_ACCESS) ] = -1,
268 [ C(RESULT_MISS) ] = -1,
269 },
270 },
271};
272
273static __initconst u64 core2_hw_cache_event_ids
274 [PERF_COUNT_HW_CACHE_MAX]
275 [PERF_COUNT_HW_CACHE_OP_MAX]
276 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
277{
278 [ C(L1D) ] = {
279 [ C(OP_READ) ] = {
280 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
281 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
282 },
283 [ C(OP_WRITE) ] = {
284 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
285 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
286 },
287 [ C(OP_PREFETCH) ] = {
288 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
289 [ C(RESULT_MISS) ] = 0,
290 },
291 },
292 [ C(L1I ) ] = {
293 [ C(OP_READ) ] = {
294 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
295 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
296 },
297 [ C(OP_WRITE) ] = {
298 [ C(RESULT_ACCESS) ] = -1,
299 [ C(RESULT_MISS) ] = -1,
300 },
301 [ C(OP_PREFETCH) ] = {
302 [ C(RESULT_ACCESS) ] = 0,
303 [ C(RESULT_MISS) ] = 0,
304 },
305 },
306 [ C(LL ) ] = {
307 [ C(OP_READ) ] = {
308 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
309 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
310 },
311 [ C(OP_WRITE) ] = {
312 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
313 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
314 },
315 [ C(OP_PREFETCH) ] = {
316 [ C(RESULT_ACCESS) ] = 0,
317 [ C(RESULT_MISS) ] = 0,
318 },
319 },
320 [ C(DTLB) ] = {
321 [ C(OP_READ) ] = {
322 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
323 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
324 },
325 [ C(OP_WRITE) ] = {
326 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
327 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
328 },
329 [ C(OP_PREFETCH) ] = {
330 [ C(RESULT_ACCESS) ] = 0,
331 [ C(RESULT_MISS) ] = 0,
332 },
333 },
334 [ C(ITLB) ] = {
335 [ C(OP_READ) ] = {
336 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
337 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
338 },
339 [ C(OP_WRITE) ] = {
340 [ C(RESULT_ACCESS) ] = -1,
341 [ C(RESULT_MISS) ] = -1,
342 },
343 [ C(OP_PREFETCH) ] = {
344 [ C(RESULT_ACCESS) ] = -1,
345 [ C(RESULT_MISS) ] = -1,
346 },
347 },
348 [ C(BPU ) ] = {
349 [ C(OP_READ) ] = {
350 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
351 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
352 },
353 [ C(OP_WRITE) ] = {
354 [ C(RESULT_ACCESS) ] = -1,
355 [ C(RESULT_MISS) ] = -1,
356 },
357 [ C(OP_PREFETCH) ] = {
358 [ C(RESULT_ACCESS) ] = -1,
359 [ C(RESULT_MISS) ] = -1,
360 },
361 },
362};
363
364static __initconst u64 atom_hw_cache_event_ids
365 [PERF_COUNT_HW_CACHE_MAX]
366 [PERF_COUNT_HW_CACHE_OP_MAX]
367 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
368{
369 [ C(L1D) ] = {
370 [ C(OP_READ) ] = {
371 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
372 [ C(RESULT_MISS) ] = 0,
373 },
374 [ C(OP_WRITE) ] = {
375 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
376 [ C(RESULT_MISS) ] = 0,
377 },
378 [ C(OP_PREFETCH) ] = {
379 [ C(RESULT_ACCESS) ] = 0x0,
380 [ C(RESULT_MISS) ] = 0,
381 },
382 },
383 [ C(L1I ) ] = {
384 [ C(OP_READ) ] = {
385 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
386 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
387 },
388 [ C(OP_WRITE) ] = {
389 [ C(RESULT_ACCESS) ] = -1,
390 [ C(RESULT_MISS) ] = -1,
391 },
392 [ C(OP_PREFETCH) ] = {
393 [ C(RESULT_ACCESS) ] = 0,
394 [ C(RESULT_MISS) ] = 0,
395 },
396 },
397 [ C(LL ) ] = {
398 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
400 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
401 },
402 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
404 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
405 },
406 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0,
408 [ C(RESULT_MISS) ] = 0,
409 },
410 },
411 [ C(DTLB) ] = {
412 [ C(OP_READ) ] = {
413 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
414 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
415 },
416 [ C(OP_WRITE) ] = {
417 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
418 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
419 },
420 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0,
422 [ C(RESULT_MISS) ] = 0,
423 },
424 },
425 [ C(ITLB) ] = {
426 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
428 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
429 },
430 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = -1,
432 [ C(RESULT_MISS) ] = -1,
433 },
434 [ C(OP_PREFETCH) ] = {
435 [ C(RESULT_ACCESS) ] = -1,
436 [ C(RESULT_MISS) ] = -1,
437 },
438 },
439 [ C(BPU ) ] = {
440 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
442 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
443 },
444 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = -1,
446 [ C(RESULT_MISS) ] = -1,
447 },
448 [ C(OP_PREFETCH) ] = {
449 [ C(RESULT_ACCESS) ] = -1,
450 [ C(RESULT_MISS) ] = -1,
451 },
452 },
453};
454
455static u64 intel_pmu_raw_event(u64 hw_event)
456{
457#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
458#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
459#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
460#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
461#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
462
463#define CORE_EVNTSEL_MASK \
464 (INTEL_ARCH_EVTSEL_MASK | \
465 INTEL_ARCH_UNIT_MASK | \
466 INTEL_ARCH_EDGE_MASK | \
467 INTEL_ARCH_INV_MASK | \
468 INTEL_ARCH_CNT_MASK)
469
470 return hw_event & CORE_EVNTSEL_MASK;
471}
472
473static void intel_pmu_enable_bts(u64 config)
474{
475 unsigned long debugctlmsr;
476
477 debugctlmsr = get_debugctlmsr();
478
479 debugctlmsr |= X86_DEBUGCTL_TR;
480 debugctlmsr |= X86_DEBUGCTL_BTS;
481 debugctlmsr |= X86_DEBUGCTL_BTINT;
482
483 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
484 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
485
486 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
487 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
488
489 update_debugctlmsr(debugctlmsr);
490}
491
492static void intel_pmu_disable_bts(void)
493{
494 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
495 unsigned long debugctlmsr;
496
497 if (!cpuc->ds)
498 return;
499
500 debugctlmsr = get_debugctlmsr();
501
502 debugctlmsr &=
503 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
504 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
505
506 update_debugctlmsr(debugctlmsr);
507}
508
509static void intel_pmu_disable_all(void)
510{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
512
513 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
514
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
516 intel_pmu_disable_bts();
517}
518
519static void intel_pmu_enable_all(void)
520{
521 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
522
523 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
524
525 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
526 struct perf_event *event =
527 cpuc->events[X86_PMC_IDX_FIXED_BTS];
528
529 if (WARN_ON_ONCE(!event))
530 return;
531
532 intel_pmu_enable_bts(event->hw.config);
533 }
534}
535
536static inline u64 intel_pmu_get_status(void)
537{
538 u64 status;
539
540 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
541
542 return status;
543}
544
545static inline void intel_pmu_ack_status(u64 ack)
546{
547 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
548}
549
550static inline void
551intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
552{
553 int idx = __idx - X86_PMC_IDX_FIXED;
554 u64 ctrl_val, mask;
555
556 mask = 0xfULL << (idx * 4);
557
558 rdmsrl(hwc->config_base, ctrl_val);
559 ctrl_val &= ~mask;
560 (void)checking_wrmsrl(hwc->config_base, ctrl_val);
561}
562
563static void intel_pmu_drain_bts_buffer(void)
564{
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct debug_store *ds = cpuc->ds;
567 struct bts_record {
568 u64 from;
569 u64 to;
570 u64 flags;
571 };
572 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
573 struct bts_record *at, *top;
574 struct perf_output_handle handle;
575 struct perf_event_header header;
576 struct perf_sample_data data;
577 struct pt_regs regs;
578
579 if (!event)
580 return;
581
582 if (!ds)
583 return;
584
585 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
586 top = (struct bts_record *)(unsigned long)ds->bts_index;
587
588 if (top <= at)
589 return;
590
591 ds->bts_index = ds->bts_buffer_base;
592
593 perf_sample_data_init(&data, 0);
594
595 data.period = event->hw.last_period;
596 regs.ip = 0;
597
598 /*
599 * Prepare a generic sample, i.e. fill in the invariant fields.
600 * We will overwrite the from and to address before we output
601 * the sample.
602 */
603 perf_prepare_sample(&header, &data, event, &regs);
604
605 if (perf_output_begin(&handle, event,
606 header.size * (top - at), 1, 1))
607 return;
608
609 for (; at < top; at++) {
610 data.ip = at->from;
611 data.addr = at->to;
612
613 perf_output_sample(&handle, &header, &data, event);
614 }
615
616 perf_output_end(&handle);
617
618 /* There's new data available. */
619 event->hw.interrupts++;
620 event->pending_kill = POLL_IN;
621}
622
623static inline void
624intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
625{
626 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
627 intel_pmu_disable_bts();
628 intel_pmu_drain_bts_buffer();
629 return;
630 }
631
632 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
633 intel_pmu_disable_fixed(hwc, idx);
634 return;
635 }
636
637 x86_pmu_disable_event(hwc, idx);
638}
639
640static inline void
641intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
642{
643 int idx = __idx - X86_PMC_IDX_FIXED;
644 u64 ctrl_val, bits, mask;
645 int err;
646
647 /*
648 * Enable IRQ generation (0x8),
649 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
650 * if requested:
651 */
652 bits = 0x8ULL;
653 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
654 bits |= 0x2;
655 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
656 bits |= 0x1;
657
658 /*
659 * ANY bit is supported in v3 and up
660 */
661 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
662 bits |= 0x4;
663
664 bits <<= (idx * 4);
665 mask = 0xfULL << (idx * 4);
666
667 rdmsrl(hwc->config_base, ctrl_val);
668 ctrl_val &= ~mask;
669 ctrl_val |= bits;
670 err = checking_wrmsrl(hwc->config_base, ctrl_val);
671}
672
673static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
674{
675 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
676 if (!__get_cpu_var(cpu_hw_events).enabled)
677 return;
678
679 intel_pmu_enable_bts(hwc->config);
680 return;
681 }
682
683 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
684 intel_pmu_enable_fixed(hwc, idx);
685 return;
686 }
687
688 __x86_pmu_enable_event(hwc, idx);
689}
690
691/*
692 * Save and restart an expired event. Called by NMI contexts,
693 * so it has to be careful about preempting normal event ops:
694 */
695static int intel_pmu_save_and_restart(struct perf_event *event)
696{
697 struct hw_perf_event *hwc = &event->hw;
698 int idx = hwc->idx;
699 int ret;
700
701 x86_perf_event_update(event, hwc, idx);
702 ret = x86_perf_event_set_period(event, hwc, idx);
703
704 return ret;
705}
706
707static void intel_pmu_reset(void)
708{
709 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
710 unsigned long flags;
711 int idx;
712
713 if (!x86_pmu.num_events)
714 return;
715
716 local_irq_save(flags);
717
718 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
719
720 for (idx = 0; idx < x86_pmu.num_events; idx++) {
721 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
722 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
723 }
724 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
725 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
726 }
727 if (ds)
728 ds->bts_index = ds->bts_buffer_base;
729
730 local_irq_restore(flags);
731}
732
733/*
734 * This handler is triggered by the local APIC, so the APIC IRQ handling
735 * rules apply:
736 */
737static int intel_pmu_handle_irq(struct pt_regs *regs)
738{
739 struct perf_sample_data data;
740 struct cpu_hw_events *cpuc;
741 int bit, loops;
742 u64 ack, status;
743
744 perf_sample_data_init(&data, 0);
745
746 cpuc = &__get_cpu_var(cpu_hw_events);
747
748 perf_disable();
749 intel_pmu_drain_bts_buffer();
750 status = intel_pmu_get_status();
751 if (!status) {
752 perf_enable();
753 return 0;
754 }
755
756 loops = 0;
757again:
758 if (++loops > 100) {
759 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
760 perf_event_print_debug();
761 intel_pmu_reset();
762 perf_enable();
763 return 1;
764 }
765
766 inc_irq_stat(apic_perf_irqs);
767 ack = status;
768 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
769 struct perf_event *event = cpuc->events[bit];
770
771 clear_bit(bit, (unsigned long *) &status);
772 if (!test_bit(bit, cpuc->active_mask))
773 continue;
774
775 if (!intel_pmu_save_and_restart(event))
776 continue;
777
778 data.period = event->hw.last_period;
779
780 if (perf_event_overflow(event, 1, &data, regs))
781 intel_pmu_disable_event(&event->hw, bit);
782 }
783
784 intel_pmu_ack_status(ack);
785
786 /*
787 * Repeat if there is more work to be done:
788 */
789 status = intel_pmu_get_status();
790 if (status)
791 goto again;
792
793 perf_enable();
794
795 return 1;
796}
797
798static struct event_constraint bts_constraint =
799 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
800
801static struct event_constraint *
802intel_special_constraints(struct perf_event *event)
803{
804 unsigned int hw_event;
805
806 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
807
808 if (unlikely((hw_event ==
809 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
810 (event->hw.sample_period == 1))) {
811
812 return &bts_constraint;
813 }
814 return NULL;
815}
816
817static struct event_constraint *
818intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
819{
820 struct event_constraint *c;
821
822 c = intel_special_constraints(event);
823 if (c)
824 return c;
825
826 return x86_get_event_constraints(cpuc, event);
827}
828
829static __initconst struct x86_pmu core_pmu = {
830 .name = "core",
831 .handle_irq = x86_pmu_handle_irq,
832 .disable_all = x86_pmu_disable_all,
833 .enable_all = x86_pmu_enable_all,
834 .enable = x86_pmu_enable_event,
835 .disable = x86_pmu_disable_event,
836 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
837 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
838 .event_map = intel_pmu_event_map,
839 .raw_event = intel_pmu_raw_event,
840 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
841 .apic = 1,
842 /*
843 * Intel PMCs cannot be accessed sanely above 32 bit width,
844 * so we install an artificial 1<<31 period regardless of
845 * the generic event period:
846 */
847 .max_period = (1ULL << 31) - 1,
848 .get_event_constraints = intel_get_event_constraints,
849 .event_constraints = intel_core_event_constraints,
850};
851
852static __initconst struct x86_pmu intel_pmu = {
853 .name = "Intel",
854 .handle_irq = intel_pmu_handle_irq,
855 .disable_all = intel_pmu_disable_all,
856 .enable_all = intel_pmu_enable_all,
857 .enable = intel_pmu_enable_event,
858 .disable = intel_pmu_disable_event,
859 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
860 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
861 .event_map = intel_pmu_event_map,
862 .raw_event = intel_pmu_raw_event,
863 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
864 .apic = 1,
865 /*
866 * Intel PMCs cannot be accessed sanely above 32 bit width,
867 * so we install an artificial 1<<31 period regardless of
868 * the generic event period:
869 */
870 .max_period = (1ULL << 31) - 1,
871 .enable_bts = intel_pmu_enable_bts,
872 .disable_bts = intel_pmu_disable_bts,
873 .get_event_constraints = intel_get_event_constraints
874};
875
876static __init int intel_pmu_init(void)
877{
878 union cpuid10_edx edx;
879 union cpuid10_eax eax;
880 unsigned int unused;
881 unsigned int ebx;
882 int version;
883
884 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
885 /* check for P6 processor family */
886 if (boot_cpu_data.x86 == 6) {
887 return p6_pmu_init();
888 } else {
889 return -ENODEV;
890 }
891 }
892
893 /*
894 * Check whether the Architectural PerfMon supports
895 * Branch Misses Retired hw_event or not.
896 */
897 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
898 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
899 return -ENODEV;
900
901 version = eax.split.version_id;
902 if (version < 2)
903 x86_pmu = core_pmu;
904 else
905 x86_pmu = intel_pmu;
906
907 x86_pmu.version = version;
908 x86_pmu.num_events = eax.split.num_events;
909 x86_pmu.event_bits = eax.split.bit_width;
910 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
911
912 /*
913 * Quirk: v2 perfmon does not report fixed-purpose events, so
914 * assume at least 3 events:
915 */
916 if (version > 1)
917 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
918
919 /*
920 * Install the hw-cache-events table:
921 */
922 switch (boot_cpu_data.x86_model) {
923 case 14: /* 65 nm core solo/duo, "Yonah" */
924 pr_cont("Core events, ");
925 break;
926
927 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
928 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
929 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
930 case 29: /* six-core 45 nm xeon "Dunnington" */
931 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
932 sizeof(hw_cache_event_ids));
933
934 x86_pmu.event_constraints = intel_core2_event_constraints;
935 pr_cont("Core2 events, ");
936 break;
937
938 case 26: /* 45 nm nehalem, "Bloomfield" */
939 case 30: /* 45 nm nehalem, "Lynnfield" */
940 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
941 sizeof(hw_cache_event_ids));
942
943 x86_pmu.event_constraints = intel_nehalem_event_constraints;
944 pr_cont("Nehalem/Corei7 events, ");
945 break;
946 case 28: /* Atom */
947 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
948 sizeof(hw_cache_event_ids));
949
950 x86_pmu.event_constraints = intel_gen_event_constraints;
951 pr_cont("Atom events, ");
952 break;
953
954 case 37: /* 32 nm nehalem, "Clarkdale" */
955 case 44: /* 32 nm nehalem, "Gulftown" */
956 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
957 sizeof(hw_cache_event_ids));
958
959 x86_pmu.event_constraints = intel_westmere_event_constraints;
960 pr_cont("Westmere events, ");
961 break;
962
963 default:
964 /*
965 * default constraints for v2 and up
966 */
967 x86_pmu.event_constraints = intel_gen_event_constraints;
968 pr_cont("generic architected perfmon, ");
969 }
970 return 0;
971}
972
973#else /* CONFIG_CPU_SUP_INTEL */
974
975static int intel_pmu_init(void)
976{
977 return 0;
978}
979
980#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
new file mode 100644
index 000000000000..a4e67b99d91c
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -0,0 +1,157 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Not sure about some of these
5 */
6static const u64 p6_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
15};
16
17static u64 p6_pmu_event_map(int hw_event)
18{
19 return p6_perfmon_event_map[hw_event];
20}
21
22/*
23 * Event setting that is specified not to count anything.
24 * We use this to effectively disable a counter.
25 *
26 * L2_RQSTS with 0 MESI unit mask.
27 */
28#define P6_NOP_EVENT 0x0000002EULL
29
30static u64 p6_pmu_raw_event(u64 hw_event)
31{
32#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
33#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
34#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
35#define P6_EVNTSEL_INV_MASK 0x00800000ULL
36#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
37
38#define P6_EVNTSEL_MASK \
39 (P6_EVNTSEL_EVENT_MASK | \
40 P6_EVNTSEL_UNIT_MASK | \
41 P6_EVNTSEL_EDGE_MASK | \
42 P6_EVNTSEL_INV_MASK | \
43 P6_EVNTSEL_REG_MASK)
44
45 return hw_event & P6_EVNTSEL_MASK;
46}
47
48static struct event_constraint p6_event_constraints[] =
49{
50 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
51 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
52 INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
53 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
54 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
55 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
56 EVENT_CONSTRAINT_END
57};
58
59static void p6_pmu_disable_all(void)
60{
61 u64 val;
62
63 /* p6 only has one enable register */
64 rdmsrl(MSR_P6_EVNTSEL0, val);
65 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
66 wrmsrl(MSR_P6_EVNTSEL0, val);
67}
68
69static void p6_pmu_enable_all(void)
70{
71 unsigned long val;
72
73 /* p6 only has one enable register */
74 rdmsrl(MSR_P6_EVNTSEL0, val);
75 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
76 wrmsrl(MSR_P6_EVNTSEL0, val);
77}
78
79static inline void
80p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
81{
82 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
83 u64 val = P6_NOP_EVENT;
84
85 if (cpuc->enabled)
86 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
87
88 (void)checking_wrmsrl(hwc->config_base + idx, val);
89}
90
91static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
92{
93 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
94 u64 val;
95
96 val = hwc->config;
97 if (cpuc->enabled)
98 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
99
100 (void)checking_wrmsrl(hwc->config_base + idx, val);
101}
102
103static __initconst struct x86_pmu p6_pmu = {
104 .name = "p6",
105 .handle_irq = x86_pmu_handle_irq,
106 .disable_all = p6_pmu_disable_all,
107 .enable_all = p6_pmu_enable_all,
108 .enable = p6_pmu_enable_event,
109 .disable = p6_pmu_disable_event,
110 .eventsel = MSR_P6_EVNTSEL0,
111 .perfctr = MSR_P6_PERFCTR0,
112 .event_map = p6_pmu_event_map,
113 .raw_event = p6_pmu_raw_event,
114 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
115 .apic = 1,
116 .max_period = (1ULL << 31) - 1,
117 .version = 0,
118 .num_events = 2,
119 /*
120 * Events have 40 bits implemented. However they are designed such
121 * that bits [32-39] are sign extensions of bit 31. As such the
122 * effective width of a event for P6-like PMU is 32 bits only.
123 *
124 * See IA-32 Intel Architecture Software developer manual Vol 3B
125 */
126 .event_bits = 32,
127 .event_mask = (1ULL << 32) - 1,
128 .get_event_constraints = x86_get_event_constraints,
129 .event_constraints = p6_event_constraints,
130};
131
132static __init int p6_pmu_init(void)
133{
134 switch (boot_cpu_data.x86_model) {
135 case 1:
136 case 3: /* Pentium Pro */
137 case 5:
138 case 6: /* Pentium II */
139 case 7:
140 case 8:
141 case 11: /* Pentium III */
142 case 9:
143 case 13:
144 /* Pentium M */
145 break;
146 default:
147 pr_cont("unsupported p6 CPU model %d ",
148 boot_cpu_data.x86_model);
149 return -ENODEV;
150 }
151
152 x86_pmu = p6_pmu;
153
154 return 0;
155}
156
157#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 898df9719afb..fb329e9f8494 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
115 115
116 return !test_bit(counter, perfctr_nmi_owner); 116 return !test_bit(counter, perfctr_nmi_owner);
117} 117}
118
119/* checks the an msr for availability */
120int avail_to_resrv_perfctr_nmi(unsigned int msr)
121{
122 unsigned int counter;
123
124 counter = nmi_perfctr_msr_to_bit(msr);
125 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127 return !test_bit(counter, perfctr_nmi_owner);
128}
129EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 118EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
130 119
131int reserve_perfctr_nmi(unsigned int msr) 120int reserve_perfctr_nmi(unsigned int msr)
@@ -691,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
691 cpu_nmi_set_wd_enabled(); 680 cpu_nmi_set_wd_enabled();
692 681
693 apic_write(APIC_LVTPC, APIC_DM_NMI); 682 apic_write(APIC_LVTPC, APIC_DM_NMI);
694 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
695 wrmsr(evntsel_msr, evntsel, 0); 684 wrmsr(evntsel_msr, evntsel, 0);
696 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
697 return 1; 686 return 1;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index ae775ca47b25..11540a189d93 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -18,11 +18,6 @@
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21/* Just a stub for now */
22int x86_is_stack_id(int id, char *name)
23{
24 return 0;
25}
26 21
27void dump_trace(struct task_struct *task, struct pt_regs *regs, 22void dump_trace(struct task_struct *task, struct pt_regs *regs,
28 unsigned long *stack, unsigned long bp, 23 unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 0ad9597073f5..d5e2a2ebb627 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -33,11 +33,6 @@ static char x86_stack_ids[][8] = {
33#endif 33#endif
34}; 34};
35 35
36int x86_is_stack_id(int id, char *name)
37{
38 return x86_stack_ids[id - 1] == name;
39}
40
41static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 36static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
42 unsigned *usedp, char **idp) 37 unsigned *usedp, char **idp)
43{ 38{
@@ -125,9 +120,15 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
125{ 120{
126#ifdef CONFIG_FRAME_POINTER 121#ifdef CONFIG_FRAME_POINTER
127 struct stack_frame *frame = (struct stack_frame *)bp; 122 struct stack_frame *frame = (struct stack_frame *)bp;
123 unsigned long next;
128 124
129 if (!in_irq_stack(stack, irq_stack, irq_stack_end)) 125 if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
130 return (unsigned long)frame->next_frame; 126 if (!probe_kernel_address(&frame->next_frame, next))
127 return next;
128 else
129 WARN_ONCE(1, "Perf: bad frame pointer = %p in "
130 "callchain\n", &frame->next_frame);
131 }
131#endif 132#endif
132 return bp; 133 return bp;
133} 134}
@@ -291,6 +292,7 @@ void show_registers(struct pt_regs *regs)
291 292
292 sp = regs->sp; 293 sp = regs->sp;
293 printk("CPU %d ", cpu); 294 printk("CPU %d ", cpu);
295 print_modules();
294 __show_regs(regs, 1); 296 __show_regs(regs, 1);
295 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 297 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
296 cur->comm, cur->pid, task_thread_info(cur), cur); 298 cur->comm, cur->pid, task_thread_info(cur), cur);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a1a7876cadcb..740b440fbd73 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,21 +12,13 @@
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/pfn.h> 15#include <linux/pfn.h>
21#include <linux/suspend.h> 16#include <linux/suspend.h>
22#include <linux/firmware-map.h> 17#include <linux/firmware-map.h>
23 18
24#include <asm/pgtable.h>
25#include <asm/page.h>
26#include <asm/e820.h> 19#include <asm/e820.h>
27#include <asm/proto.h> 20#include <asm/proto.h>
28#include <asm/setup.h> 21#include <asm/setup.h>
29#include <asm/trampoline.h>
30 22
31/* 23/*
32 * The e820 map is the map that gets modified e.g. with command line parameters 24 * The e820 map is the map that gets modified e.g. with command line parameters
@@ -517,11 +509,19 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
517 int checktype) 509 int checktype)
518{ 510{
519 int i; 511 int i;
512 u64 end;
520 u64 real_removed_size = 0; 513 u64 real_removed_size = 0;
521 514
522 if (size > (ULLONG_MAX - start)) 515 if (size > (ULLONG_MAX - start))
523 size = ULLONG_MAX - start; 516 size = ULLONG_MAX - start;
524 517
518 end = start + size;
519 printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
520 (unsigned long long) start,
521 (unsigned long long) end);
522 e820_print_type(old_type);
523 printk(KERN_CONT "\n");
524
525 for (i = 0; i < e820.nr_map; i++) { 525 for (i = 0; i < e820.nr_map; i++) {
526 struct e820entry *ei = &e820.map[i]; 526 struct e820entry *ei = &e820.map[i];
527 u64 final_start, final_end; 527 u64 final_start, final_end;
@@ -722,319 +722,44 @@ core_initcall(e820_mark_nvs_memory);
722#endif 722#endif
723 723
724/* 724/*
725 * Early reserved memory areas. 725 * Find a free area with specified alignment in a specific range.
726 */
727#define MAX_EARLY_RES 32
728
729struct early_res {
730 u64 start, end;
731 char name[16];
732 char overlap_ok;
733};
734static struct early_res early_res[MAX_EARLY_RES] __initdata = {
735 { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
736#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE)
737 /*
738 * But first pinch a few for the stack/trampoline stuff
739 * FIXME: Don't need the extra page at 4K, but need to fix
740 * trampoline before removing it. (see the GDT stuff)
741 */
742 { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 },
743#endif
744
745 {}
746};
747
748static int __init find_overlapped_early(u64 start, u64 end)
749{
750 int i;
751 struct early_res *r;
752
753 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
754 r = &early_res[i];
755 if (end > r->start && start < r->end)
756 break;
757 }
758
759 return i;
760}
761
762/*
763 * Drop the i-th range from the early reservation map,
764 * by copying any higher ranges down one over it, and
765 * clearing what had been the last slot.
766 */
767static void __init drop_range(int i)
768{
769 int j;
770
771 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
772 ;
773
774 memmove(&early_res[i], &early_res[i + 1],
775 (j - 1 - i) * sizeof(struct early_res));
776
777 early_res[j - 1].end = 0;
778}
779
780/*
781 * Split any existing ranges that:
782 * 1) are marked 'overlap_ok', and
783 * 2) overlap with the stated range [start, end)
784 * into whatever portion (if any) of the existing range is entirely
785 * below or entirely above the stated range. Drop the portion
786 * of the existing range that overlaps with the stated range,
787 * which will allow the caller of this routine to then add that
788 * stated range without conflicting with any existing range.
789 */ 726 */
790static void __init drop_overlaps_that_are_ok(u64 start, u64 end) 727u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
791{ 728{
792 int i; 729 int i;
793 struct early_res *r;
794 u64 lower_start, lower_end;
795 u64 upper_start, upper_end;
796 char name[16];
797 730
798 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { 731 for (i = 0; i < e820.nr_map; i++) {
799 r = &early_res[i]; 732 struct e820entry *ei = &e820.map[i];
733 u64 addr;
734 u64 ei_start, ei_last;
800 735
801 /* Continue past non-overlapping ranges */ 736 if (ei->type != E820_RAM)
802 if (end <= r->start || start >= r->end)
803 continue; 737 continue;
804 738
805 /* 739 ei_last = ei->addr + ei->size;
806 * Leave non-ok overlaps as is; let caller 740 ei_start = ei->addr;
807 * panic "Overlapping early reservations" 741 addr = find_early_area(ei_start, ei_last, start, end,
808 * when it hits this overlap. 742 size, align);
809 */
810 if (!r->overlap_ok)
811 return;
812
813 /*
814 * We have an ok overlap. We will drop it from the early
815 * reservation map, and add back in any non-overlapping
816 * portions (lower or upper) as separate, overlap_ok,
817 * non-overlapping ranges.
818 */
819
820 /* 1. Note any non-overlapping (lower or upper) ranges. */
821 strncpy(name, r->name, sizeof(name) - 1);
822
823 lower_start = lower_end = 0;
824 upper_start = upper_end = 0;
825 if (r->start < start) {
826 lower_start = r->start;
827 lower_end = start;
828 }
829 if (r->end > end) {
830 upper_start = end;
831 upper_end = r->end;
832 }
833
834 /* 2. Drop the original ok overlapping range */
835 drop_range(i);
836
837 i--; /* resume for-loop on copied down entry */
838
839 /* 3. Add back in any non-overlapping ranges. */
840 if (lower_end)
841 reserve_early_overlap_ok(lower_start, lower_end, name);
842 if (upper_end)
843 reserve_early_overlap_ok(upper_start, upper_end, name);
844 }
845}
846
847static void __init __reserve_early(u64 start, u64 end, char *name,
848 int overlap_ok)
849{
850 int i;
851 struct early_res *r;
852
853 i = find_overlapped_early(start, end);
854 if (i >= MAX_EARLY_RES)
855 panic("Too many early reservations");
856 r = &early_res[i];
857 if (r->end)
858 panic("Overlapping early reservations "
859 "%llx-%llx %s to %llx-%llx %s\n",
860 start, end - 1, name?name:"", r->start,
861 r->end - 1, r->name);
862 r->start = start;
863 r->end = end;
864 r->overlap_ok = overlap_ok;
865 if (name)
866 strncpy(r->name, name, sizeof(r->name) - 1);
867}
868
869/*
870 * A few early reservtations come here.
871 *
872 * The 'overlap_ok' in the name of this routine does -not- mean it
873 * is ok for these reservations to overlap an earlier reservation.
874 * Rather it means that it is ok for subsequent reservations to
875 * overlap this one.
876 *
877 * Use this entry point to reserve early ranges when you are doing
878 * so out of "Paranoia", reserving perhaps more memory than you need,
879 * just in case, and don't mind a subsequent overlapping reservation
880 * that is known to be needed.
881 *
882 * The drop_overlaps_that_are_ok() call here isn't really needed.
883 * It would be needed if we had two colliding 'overlap_ok'
884 * reservations, so that the second such would not panic on the
885 * overlap with the first. We don't have any such as of this
886 * writing, but might as well tolerate such if it happens in
887 * the future.
888 */
889void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
890{
891 drop_overlaps_that_are_ok(start, end);
892 __reserve_early(start, end, name, 1);
893}
894
895/*
896 * Most early reservations come here.
897 *
898 * We first have drop_overlaps_that_are_ok() drop any pre-existing
899 * 'overlap_ok' ranges, so that we can then reserve this memory
900 * range without risk of panic'ing on an overlapping overlap_ok
901 * early reservation.
902 */
903void __init reserve_early(u64 start, u64 end, char *name)
904{
905 if (start >= end)
906 return;
907
908 drop_overlaps_that_are_ok(start, end);
909 __reserve_early(start, end, name, 0);
910}
911
912void __init free_early(u64 start, u64 end)
913{
914 struct early_res *r;
915 int i;
916
917 i = find_overlapped_early(start, end);
918 r = &early_res[i];
919 if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
920 panic("free_early on not reserved area: %llx-%llx!",
921 start, end - 1);
922
923 drop_range(i);
924}
925
926void __init early_res_to_bootmem(u64 start, u64 end)
927{
928 int i, count;
929 u64 final_start, final_end;
930
931 count = 0;
932 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
933 count++;
934
935 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
936 count, start, end);
937 for (i = 0; i < count; i++) {
938 struct early_res *r = &early_res[i];
939 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
940 r->start, r->end, r->name);
941 final_start = max(start, r->start);
942 final_end = min(end, r->end);
943 if (final_start >= final_end) {
944 printk(KERN_CONT "\n");
945 continue;
946 }
947 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
948 final_start, final_end);
949 reserve_bootmem_generic(final_start, final_end - final_start,
950 BOOTMEM_DEFAULT);
951 }
952}
953 743
954/* Check for already reserved areas */ 744 if (addr != -1ULL)
955static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) 745 return addr;
956{
957 int i;
958 u64 addr = *addrp;
959 int changed = 0;
960 struct early_res *r;
961again:
962 i = find_overlapped_early(addr, addr + size);
963 r = &early_res[i];
964 if (i < MAX_EARLY_RES && r->end) {
965 *addrp = addr = round_up(r->end, align);
966 changed = 1;
967 goto again;
968 } 746 }
969 return changed; 747 return -1ULL;
970} 748}
971 749
972/* Check for already reserved areas */ 750u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
973static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
974{ 751{
975 int i; 752 return find_e820_area(start, end, size, align);
976 u64 addr = *addrp, last;
977 u64 size = *sizep;
978 int changed = 0;
979again:
980 last = addr + size;
981 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
982 struct early_res *r = &early_res[i];
983 if (last > r->start && addr < r->start) {
984 size = r->start - addr;
985 changed = 1;
986 goto again;
987 }
988 if (last > r->end && addr < r->end) {
989 addr = round_up(r->end, align);
990 size = last - addr;
991 changed = 1;
992 goto again;
993 }
994 if (last <= r->end && addr >= r->start) {
995 (*sizep)++;
996 return 0;
997 }
998 }
999 if (changed) {
1000 *addrp = addr;
1001 *sizep = size;
1002 }
1003 return changed;
1004} 753}
1005 754
1006/* 755u64 __init get_max_mapped(void)
1007 * Find a free area with specified alignment in a specific range.
1008 */
1009u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
1010{ 756{
1011 int i; 757 u64 end = max_pfn_mapped;
1012 758
1013 for (i = 0; i < e820.nr_map; i++) { 759 end <<= PAGE_SHIFT;
1014 struct e820entry *ei = &e820.map[i];
1015 u64 addr, last;
1016 u64 ei_last;
1017 760
1018 if (ei->type != E820_RAM) 761 return end;
1019 continue;
1020 addr = round_up(ei->addr, align);
1021 ei_last = ei->addr + ei->size;
1022 if (addr < start)
1023 addr = round_up(start, align);
1024 if (addr >= ei_last)
1025 continue;
1026 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
1027 ;
1028 last = addr + size;
1029 if (last > ei_last)
1030 continue;
1031 if (last > end)
1032 continue;
1033 return addr;
1034 }
1035 return -1ULL;
1036} 762}
1037
1038/* 763/*
1039 * Find next free range after *start 764 * Find next free range after *start
1040 */ 765 */
@@ -1044,25 +769,19 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
1044 769
1045 for (i = 0; i < e820.nr_map; i++) { 770 for (i = 0; i < e820.nr_map; i++) {
1046 struct e820entry *ei = &e820.map[i]; 771 struct e820entry *ei = &e820.map[i];
1047 u64 addr, last; 772 u64 addr;
1048 u64 ei_last; 773 u64 ei_start, ei_last;
1049 774
1050 if (ei->type != E820_RAM) 775 if (ei->type != E820_RAM)
1051 continue; 776 continue;
1052 addr = round_up(ei->addr, align); 777
1053 ei_last = ei->addr + ei->size; 778 ei_last = ei->addr + ei->size;
1054 if (addr < start) 779 ei_start = ei->addr;
1055 addr = round_up(start, align); 780 addr = find_early_area_size(ei_start, ei_last, start,
1056 if (addr >= ei_last) 781 sizep, align);
1057 continue; 782
1058 *sizep = ei_last - addr; 783 if (addr != -1ULL)
1059 while (bad_addr_size(&addr, sizep, align) && 784 return addr;
1060 addr + *sizep <= ei_last)
1061 ;
1062 last = addr + *sizep;
1063 if (last > ei_last)
1064 continue;
1065 return addr;
1066 } 785 }
1067 786
1068 return -1ULL; 787 return -1ULL;
@@ -1421,6 +1140,8 @@ void __init e820_reserve_resources_late(void)
1421 end = MAX_RESOURCE_SIZE; 1140 end = MAX_RESOURCE_SIZE;
1422 if (start >= end) 1141 if (start >= end)
1423 continue; 1142 continue;
1143 printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
1144 start, end);
1424 reserve_region_with_split(&iomem_resource, start, end, 1145 reserve_region_with_split(&iomem_resource, start, end,
1425 "RAM buffer"); 1146 "RAM buffer");
1426 } 1147 }
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index cdcfb122f256..c2fa9b8b497e 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -362,7 +362,7 @@ void __init efi_init(void)
362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); 362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
363 early_iounmap(tmp, 2); 363 early_iounmap(tmp, 2);
364 364
365 printk(KERN_INFO "EFI v%u.%.02u by %s \n", 365 printk(KERN_INFO "EFI v%u.%.02u by %s\n",
366 efi.systab->hdr.revision >> 16, 366 efi.systab->hdr.revision >> 16,
367 efi.systab->hdr.revision & 0xffff, vendor); 367 efi.systab->hdr.revision & 0xffff, vendor);
368 368
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 309689245431..cd37469b54ee 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -30,14 +30,32 @@
30 30
31#ifdef CONFIG_DYNAMIC_FTRACE 31#ifdef CONFIG_DYNAMIC_FTRACE
32 32
33/*
34 * modifying_code is set to notify NMIs that they need to use
35 * memory barriers when entering or exiting. But we don't want
36 * to burden NMIs with unnecessary memory barriers when code
37 * modification is not being done (which is most of the time).
38 *
39 * A mutex is already held when ftrace_arch_code_modify_prepare
40 * and post_process are called. No locks need to be taken here.
41 *
42 * Stop machine will make sure currently running NMIs are done
43 * and new NMIs will see the updated variable before we need
44 * to worry about NMIs doing memory barriers.
45 */
46static int modifying_code __read_mostly;
47static DEFINE_PER_CPU(int, save_modifying_code);
48
33int ftrace_arch_code_modify_prepare(void) 49int ftrace_arch_code_modify_prepare(void)
34{ 50{
35 set_kernel_text_rw(); 51 set_kernel_text_rw();
52 modifying_code = 1;
36 return 0; 53 return 0;
37} 54}
38 55
39int ftrace_arch_code_modify_post_process(void) 56int ftrace_arch_code_modify_post_process(void)
40{ 57{
58 modifying_code = 0;
41 set_kernel_text_ro(); 59 set_kernel_text_ro();
42 return 0; 60 return 0;
43} 61}
@@ -149,6 +167,11 @@ static void ftrace_mod_code(void)
149 167
150void ftrace_nmi_enter(void) 168void ftrace_nmi_enter(void)
151{ 169{
170 __get_cpu_var(save_modifying_code) = modifying_code;
171
172 if (!__get_cpu_var(save_modifying_code))
173 return;
174
152 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { 175 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
153 smp_rmb(); 176 smp_rmb();
154 ftrace_mod_code(); 177 ftrace_mod_code();
@@ -160,6 +183,9 @@ void ftrace_nmi_enter(void)
160 183
161void ftrace_nmi_exit(void) 184void ftrace_nmi_exit(void)
162{ 185{
186 if (!__get_cpu_var(save_modifying_code))
187 return;
188
163 /* Finish all executions before clearing nmi_running */ 189 /* Finish all executions before clearing nmi_running */
164 smp_mb(); 190 smp_mb();
165 atomic_dec(&nmi_running); 191 atomic_dec(&nmi_running);
@@ -484,13 +510,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
484 } 510 }
485} 511}
486#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 512#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
487
488#ifdef CONFIG_FTRACE_SYSCALLS
489
490extern unsigned long *sys_call_table;
491
492unsigned long __init arch_syscall_addr(int nr)
493{
494 return (unsigned long)(&sys_call_table)[nr];
495}
496#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 5051b94c9069..adedeef1dedc 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,6 +29,16 @@ static void __init i386_default_early_setup(void)
29 29
30void __init i386_start_kernel(void) 30void __init i386_start_kernel(void)
31{ 31{
32#ifdef CONFIG_X86_TRAMPOLINE
33 /*
34 * But first pinch a few for the stack/trampoline stuff
35 * FIXME: Don't need the extra page at 4K, but need to fix
36 * trampoline before removing it. (see the GDT stuff)
37 */
38 reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
39 "EX TRAMPOLINE");
40#endif
41
32 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 42 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
33 43
34#ifdef CONFIG_BLK_DEV_INITRD 44#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7fd318bac59c..37c3d4b17d85 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -442,8 +442,8 @@ is386: movl $2,%ecx # set MP
442 */ 442 */
443 cmpb $0,ready 443 cmpb $0,ready
444 jne 1f 444 jne 1f
445 movl $per_cpu__gdt_page,%eax 445 movl $gdt_page,%eax
446 movl $per_cpu__stack_canary,%ecx 446 movl $stack_canary,%ecx
447 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) 447 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
448 shrl $16, %ecx 448 shrl $16, %ecx
449 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) 449 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -706,7 +706,7 @@ idt_descr:
706 .word 0 # 32 bit align gdt_desc.address 706 .word 0 # 32 bit align gdt_desc.address
707ENTRY(early_gdt_descr) 707ENTRY(early_gdt_descr)
708 .word GDT_ENTRIES*8-1 708 .word GDT_ENTRIES*8-1
709 .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ 709 .long gdt_page /* Overwritten for secondary CPUs */
710 710
711/* 711/*
712 * The boot_gdt must mirror the equivalent in setup.S and is 712 * The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 2d8b5035371c..3d1e6f16b7a6 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -27,7 +27,7 @@
27#define GET_CR2_INTO_RCX movq %cr2, %rcx 27#define GET_CR2_INTO_RCX movq %cr2, %rcx
28#endif 28#endif
29 29
30/* we are not able to switch in one step to the final KERNEL ADRESS SPACE 30/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
31 * because we need identity-mapped pages. 31 * because we need identity-mapped pages.
32 * 32 *
33 */ 33 */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad80a1c718c6..ee4fa1bfcb33 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -266,7 +266,7 @@ static void hpet_resume_device(void)
266 force_hpet_resume(); 266 force_hpet_resume();
267} 267}
268 268
269static void hpet_resume_counter(void) 269static void hpet_resume_counter(struct clocksource *cs)
270{ 270{
271 hpet_resume_device(); 271 hpet_resume_device();
272 hpet_restart_counter(); 272 hpet_restart_counter();
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 05d5fec64a94..d6cc065f519f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -212,25 +212,6 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); 212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213} 213}
214 214
215/*
216 * Store a breakpoint's encoded address, length, and type.
217 */
218static int arch_store_info(struct perf_event *bp)
219{
220 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
221 /*
222 * For kernel-addresses, either the address or symbol name can be
223 * specified.
224 */
225 if (info->name)
226 info->address = (unsigned long)
227 kallsyms_lookup_name(info->name);
228 if (info->address)
229 return 0;
230
231 return -EINVAL;
232}
233
234int arch_bp_generic_fields(int x86_len, int x86_type, 215int arch_bp_generic_fields(int x86_len, int x86_type,
235 int *gen_len, int *gen_type) 216 int *gen_len, int *gen_type)
236{ 217{
@@ -362,10 +343,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
362 return ret; 343 return ret;
363 } 344 }
364 345
365 ret = arch_store_info(bp);
366
367 if (ret < 0)
368 return ret;
369 /* 346 /*
370 * Check that the low-order bits of the address are appropriate 347 * Check that the low-order bits of the address are appropriate
371 * for the alignment implied by len. 348 * for the alignment implied by len.
@@ -502,8 +479,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
502 rcu_read_lock(); 479 rcu_read_lock();
503 480
504 bp = per_cpu(bp_per_reg[i], cpu); 481 bp = per_cpu(bp_per_reg[i], cpu);
505 if (bp)
506 rc = NOTIFY_DONE;
507 /* 482 /*
508 * Reset the 'i'th TRAP bit in dr6 to denote completion of 483 * Reset the 'i'th TRAP bit in dr6 to denote completion of
509 * exception handling 484 * exception handling
@@ -522,7 +497,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
522 497
523 rcu_read_unlock(); 498 rcu_read_unlock();
524 } 499 }
525 if (dr6 & (~DR_TRAP_BITS)) 500 /*
501 * Further processing in do_debug() is needed for a) user-space
502 * breakpoints (to generate signals) and b) when the system has
503 * taken exception due to multiple causes
504 */
505 if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
506 (dr6 & (~DR_TRAP_BITS)))
526 rc = NOTIFY_DONE; 507 rc = NOTIFY_DONE;
527 508
528 set_debugreg(dr7, 7); 509 set_debugreg(dr7, 7);
@@ -547,8 +528,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp)
547{ 528{
548 /* TODO */ 529 /* TODO */
549} 530}
550
551void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
552{
553 /* TODO */
554}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f2f8540a7f3d..c01a2b846d47 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -164,6 +164,11 @@ int init_fpu(struct task_struct *tsk)
164 return 0; 164 return 0;
165} 165}
166 166
167/*
168 * The xstateregs_active() routine is the same as the fpregs_active() routine,
169 * as the "regset->n" for the xstate regset will be updated based on the feature
170 * capabilites supported by the xsave.
171 */
167int fpregs_active(struct task_struct *target, const struct user_regset *regset) 172int fpregs_active(struct task_struct *target, const struct user_regset *regset)
168{ 173{
169 return tsk_used_math(target) ? regset->n : 0; 174 return tsk_used_math(target) ? regset->n : 0;
@@ -204,8 +209,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
204 if (ret) 209 if (ret)
205 return ret; 210 return ret;
206 211
207 set_stopped_child_used_math(target);
208
209 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 212 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
210 &target->thread.xstate->fxsave, 0, -1); 213 &target->thread.xstate->fxsave, 0, -1);
211 214
@@ -224,6 +227,68 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
224 return ret; 227 return ret;
225} 228}
226 229
230int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
231 unsigned int pos, unsigned int count,
232 void *kbuf, void __user *ubuf)
233{
234 int ret;
235
236 if (!cpu_has_xsave)
237 return -ENODEV;
238
239 ret = init_fpu(target);
240 if (ret)
241 return ret;
242
243 /*
244 * Copy the 48bytes defined by the software first into the xstate
245 * memory layout in the thread struct, so that we can copy the entire
246 * xstateregs to the user using one user_regset_copyout().
247 */
248 memcpy(&target->thread.xstate->fxsave.sw_reserved,
249 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
250
251 /*
252 * Copy the xstate memory layout.
253 */
254 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
255 &target->thread.xstate->xsave, 0, -1);
256 return ret;
257}
258
259int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
260 unsigned int pos, unsigned int count,
261 const void *kbuf, const void __user *ubuf)
262{
263 int ret;
264 struct xsave_hdr_struct *xsave_hdr;
265
266 if (!cpu_has_xsave)
267 return -ENODEV;
268
269 ret = init_fpu(target);
270 if (ret)
271 return ret;
272
273 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
274 &target->thread.xstate->xsave, 0, -1);
275
276 /*
277 * mxcsr reserved bits must be masked to zero for security reasons.
278 */
279 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
280
281 xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
282
283 xsave_hdr->xstate_bv &= pcntxt_mask;
284 /*
285 * These bits must be zero.
286 */
287 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
288
289 return ret;
290}
291
227#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 292#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
228 293
229/* 294/*
@@ -404,8 +469,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
404 if (ret) 469 if (ret)
405 return ret; 470 return ret;
406 471
407 set_stopped_child_used_math(target);
408
409 if (!HAVE_HWFP) 472 if (!HAVE_HWFP)
410 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 473 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
411 474
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index df89102bef80..fb725ee15f55 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -32,8 +32,14 @@
32 */ 32 */
33 33
34static int i8259A_auto_eoi; 34static int i8259A_auto_eoi;
35DEFINE_SPINLOCK(i8259A_lock); 35DEFINE_RAW_SPINLOCK(i8259A_lock);
36static void mask_and_ack_8259A(unsigned int); 36static void mask_and_ack_8259A(unsigned int);
37static void mask_8259A(void);
38static void unmask_8259A(void);
39static void disable_8259A_irq(unsigned int irq);
40static void enable_8259A_irq(unsigned int irq);
41static void init_8259A(int auto_eoi);
42static int i8259A_irq_pending(unsigned int irq);
37 43
38struct irq_chip i8259A_chip = { 44struct irq_chip i8259A_chip = {
39 .name = "XT-PIC", 45 .name = "XT-PIC",
@@ -63,51 +69,51 @@ unsigned int cached_irq_mask = 0xffff;
63 */ 69 */
64unsigned long io_apic_irqs; 70unsigned long io_apic_irqs;
65 71
66void disable_8259A_irq(unsigned int irq) 72static void disable_8259A_irq(unsigned int irq)
67{ 73{
68 unsigned int mask = 1 << irq; 74 unsigned int mask = 1 << irq;
69 unsigned long flags; 75 unsigned long flags;
70 76
71 spin_lock_irqsave(&i8259A_lock, flags); 77 raw_spin_lock_irqsave(&i8259A_lock, flags);
72 cached_irq_mask |= mask; 78 cached_irq_mask |= mask;
73 if (irq & 8) 79 if (irq & 8)
74 outb(cached_slave_mask, PIC_SLAVE_IMR); 80 outb(cached_slave_mask, PIC_SLAVE_IMR);
75 else 81 else
76 outb(cached_master_mask, PIC_MASTER_IMR); 82 outb(cached_master_mask, PIC_MASTER_IMR);
77 spin_unlock_irqrestore(&i8259A_lock, flags); 83 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
78} 84}
79 85
80void enable_8259A_irq(unsigned int irq) 86static void enable_8259A_irq(unsigned int irq)
81{ 87{
82 unsigned int mask = ~(1 << irq); 88 unsigned int mask = ~(1 << irq);
83 unsigned long flags; 89 unsigned long flags;
84 90
85 spin_lock_irqsave(&i8259A_lock, flags); 91 raw_spin_lock_irqsave(&i8259A_lock, flags);
86 cached_irq_mask &= mask; 92 cached_irq_mask &= mask;
87 if (irq & 8) 93 if (irq & 8)
88 outb(cached_slave_mask, PIC_SLAVE_IMR); 94 outb(cached_slave_mask, PIC_SLAVE_IMR);
89 else 95 else
90 outb(cached_master_mask, PIC_MASTER_IMR); 96 outb(cached_master_mask, PIC_MASTER_IMR);
91 spin_unlock_irqrestore(&i8259A_lock, flags); 97 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
92} 98}
93 99
94int i8259A_irq_pending(unsigned int irq) 100static int i8259A_irq_pending(unsigned int irq)
95{ 101{
96 unsigned int mask = 1<<irq; 102 unsigned int mask = 1<<irq;
97 unsigned long flags; 103 unsigned long flags;
98 int ret; 104 int ret;
99 105
100 spin_lock_irqsave(&i8259A_lock, flags); 106 raw_spin_lock_irqsave(&i8259A_lock, flags);
101 if (irq < 8) 107 if (irq < 8)
102 ret = inb(PIC_MASTER_CMD) & mask; 108 ret = inb(PIC_MASTER_CMD) & mask;
103 else 109 else
104 ret = inb(PIC_SLAVE_CMD) & (mask >> 8); 110 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
105 spin_unlock_irqrestore(&i8259A_lock, flags); 111 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
106 112
107 return ret; 113 return ret;
108} 114}
109 115
110void make_8259A_irq(unsigned int irq) 116static void make_8259A_irq(unsigned int irq)
111{ 117{
112 disable_irq_nosync(irq); 118 disable_irq_nosync(irq);
113 io_apic_irqs &= ~(1<<irq); 119 io_apic_irqs &= ~(1<<irq);
@@ -150,7 +156,7 @@ static void mask_and_ack_8259A(unsigned int irq)
150 unsigned int irqmask = 1 << irq; 156 unsigned int irqmask = 1 << irq;
151 unsigned long flags; 157 unsigned long flags;
152 158
153 spin_lock_irqsave(&i8259A_lock, flags); 159 raw_spin_lock_irqsave(&i8259A_lock, flags);
154 /* 160 /*
155 * Lightweight spurious IRQ detection. We do not want 161 * Lightweight spurious IRQ detection. We do not want
156 * to overdo spurious IRQ handling - it's usually a sign 162 * to overdo spurious IRQ handling - it's usually a sign
@@ -183,7 +189,7 @@ handle_real_irq:
183 outb(cached_master_mask, PIC_MASTER_IMR); 189 outb(cached_master_mask, PIC_MASTER_IMR);
184 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ 190 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
185 } 191 }
186 spin_unlock_irqrestore(&i8259A_lock, flags); 192 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
187 return; 193 return;
188 194
189spurious_8259A_irq: 195spurious_8259A_irq:
@@ -281,37 +287,37 @@ static int __init i8259A_init_sysfs(void)
281 287
282device_initcall(i8259A_init_sysfs); 288device_initcall(i8259A_init_sysfs);
283 289
284void mask_8259A(void) 290static void mask_8259A(void)
285{ 291{
286 unsigned long flags; 292 unsigned long flags;
287 293
288 spin_lock_irqsave(&i8259A_lock, flags); 294 raw_spin_lock_irqsave(&i8259A_lock, flags);
289 295
290 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 296 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
291 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 297 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
292 298
293 spin_unlock_irqrestore(&i8259A_lock, flags); 299 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
294} 300}
295 301
296void unmask_8259A(void) 302static void unmask_8259A(void)
297{ 303{
298 unsigned long flags; 304 unsigned long flags;
299 305
300 spin_lock_irqsave(&i8259A_lock, flags); 306 raw_spin_lock_irqsave(&i8259A_lock, flags);
301 307
302 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 308 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
303 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 309 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
304 310
305 spin_unlock_irqrestore(&i8259A_lock, flags); 311 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
306} 312}
307 313
308void init_8259A(int auto_eoi) 314static void init_8259A(int auto_eoi)
309{ 315{
310 unsigned long flags; 316 unsigned long flags;
311 317
312 i8259A_auto_eoi = auto_eoi; 318 i8259A_auto_eoi = auto_eoi;
313 319
314 spin_lock_irqsave(&i8259A_lock, flags); 320 raw_spin_lock_irqsave(&i8259A_lock, flags);
315 321
316 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 322 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
317 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 323 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
@@ -356,5 +362,49 @@ void init_8259A(int auto_eoi)
356 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 362 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
357 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 363 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
358 364
359 spin_unlock_irqrestore(&i8259A_lock, flags); 365 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
360} 366}
367
368/*
369 * make i8259 a driver so that we can select pic functions at run time. the goal
370 * is to make x86 binary compatible among pc compatible and non-pc compatible
371 * platforms, such as x86 MID.
372 */
373
374static void legacy_pic_noop(void) { };
375static void legacy_pic_uint_noop(unsigned int unused) { };
376static void legacy_pic_int_noop(int unused) { };
377
378static struct irq_chip dummy_pic_chip = {
379 .name = "dummy pic",
380 .mask = legacy_pic_uint_noop,
381 .unmask = legacy_pic_uint_noop,
382 .disable = legacy_pic_uint_noop,
383 .mask_ack = legacy_pic_uint_noop,
384};
385static int legacy_pic_irq_pending_noop(unsigned int irq)
386{
387 return 0;
388}
389
390struct legacy_pic null_legacy_pic = {
391 .nr_legacy_irqs = 0,
392 .chip = &dummy_pic_chip,
393 .mask_all = legacy_pic_noop,
394 .restore_mask = legacy_pic_noop,
395 .init = legacy_pic_int_noop,
396 .irq_pending = legacy_pic_irq_pending_noop,
397 .make_irq = legacy_pic_uint_noop,
398};
399
400struct legacy_pic default_legacy_pic = {
401 .nr_legacy_irqs = NR_IRQS_LEGACY,
402 .chip = &i8259A_chip,
403 .mask_all = mask_8259A,
404 .restore_mask = unmask_8259A,
405 .init = init_8259A,
406 .irq_pending = i8259A_irq_pending,
407 .make_irq = make_8259A_irq,
408};
409
410struct legacy_pic *legacy_pic = &default_legacy_pic;
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index d5932226614f..ef257fc2921b 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -84,24 +84,7 @@ static struct irqaction irq2 = {
84}; 84};
85 85
86DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 86DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
87 [0 ... IRQ0_VECTOR - 1] = -1, 87 [0 ... NR_VECTORS - 1] = -1,
88 [IRQ0_VECTOR] = 0,
89 [IRQ1_VECTOR] = 1,
90 [IRQ2_VECTOR] = 2,
91 [IRQ3_VECTOR] = 3,
92 [IRQ4_VECTOR] = 4,
93 [IRQ5_VECTOR] = 5,
94 [IRQ6_VECTOR] = 6,
95 [IRQ7_VECTOR] = 7,
96 [IRQ8_VECTOR] = 8,
97 [IRQ9_VECTOR] = 9,
98 [IRQ10_VECTOR] = 10,
99 [IRQ11_VECTOR] = 11,
100 [IRQ12_VECTOR] = 12,
101 [IRQ13_VECTOR] = 13,
102 [IRQ14_VECTOR] = 14,
103 [IRQ15_VECTOR] = 15,
104 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
105}; 88};
106 89
107int vector_used_by_percpu_irq(unsigned int vector) 90int vector_used_by_percpu_irq(unsigned int vector)
@@ -123,12 +106,12 @@ void __init init_ISA_irqs(void)
123#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 106#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
124 init_bsp_APIC(); 107 init_bsp_APIC();
125#endif 108#endif
126 init_8259A(0); 109 legacy_pic->init(0);
127 110
128 /* 111 /*
129 * 16 old-style INTA-cycle interrupts: 112 * 16 old-style INTA-cycle interrupts:
130 */ 113 */
131 for (i = 0; i < NR_IRQS_LEGACY; i++) { 114 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
132 struct irq_desc *desc = irq_to_desc(i); 115 struct irq_desc *desc = irq_to_desc(i);
133 116
134 desc->status = IRQ_DISABLED; 117 desc->status = IRQ_DISABLED;
@@ -142,6 +125,19 @@ void __init init_ISA_irqs(void)
142 125
143void __init init_IRQ(void) 126void __init init_IRQ(void)
144{ 127{
128 int i;
129
130 /*
131 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
132 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
133 * then this configuration will likely be static after the boot. If
134 * these IRQ's are handled by more mordern controllers like IO-APIC,
135 * then this vector space can be freed and re-used dynamically as the
136 * irq's migrate etc.
137 */
138 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
139 per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
140
145 x86_init.irqs.intr_init(); 141 x86_init.irqs.intr_init();
146} 142}
147 143
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 5b8c7505b3bc..b43bbaebe2c0 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h> 51#include <linux/kallsyms.h>
52#include <linux/ftrace.h>
52 53
53#include <asm/cacheflush.h> 54#include <asm/cacheflush.h>
54#include <asm/desc.h> 55#include <asm/desc.h>
@@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
106}; 107};
107const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 108const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
108 109
109/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 110static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
110static void __kprobes set_jmp_op(void *from, void *to)
111{ 111{
112 struct __arch_jmp_op { 112 struct __arch_relative_insn {
113 char op; 113 u8 op;
114 s32 raddr; 114 s32 raddr;
115 } __attribute__((packed)) * jop; 115 } __attribute__((packed)) *insn;
116 jop = (struct __arch_jmp_op *)from; 116
117 jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); 117 insn = (struct __arch_relative_insn *)from;
118 jop->op = RELATIVEJUMP_INSTRUCTION; 118 insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
119 insn->op = op;
120}
121
122/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
123static void __kprobes synthesize_reljump(void *from, void *to)
124{
125 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
119} 126}
120 127
121/* 128/*
@@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
202 /* 209 /*
203 * Basically, kp->ainsn.insn has an original instruction. 210 * Basically, kp->ainsn.insn has an original instruction.
204 * However, RIP-relative instruction can not do single-stepping 211 * However, RIP-relative instruction can not do single-stepping
205 * at different place, fix_riprel() tweaks the displacement of 212 * at different place, __copy_instruction() tweaks the displacement of
206 * that instruction. In that case, we can't recover the instruction 213 * that instruction. In that case, we can't recover the instruction
207 * from the kp->ainsn.insn. 214 * from the kp->ainsn.insn.
208 * 215 *
@@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
284} 291}
285 292
286/* 293/*
287 * Adjust the displacement if the instruction uses the %rip-relative 294 * Copy an instruction and adjust the displacement if the instruction
288 * addressing mode. 295 * uses the %rip-relative addressing mode.
289 * If it does, Return the address of the 32-bit displacement word. 296 * If it does, Return the address of the 32-bit displacement word.
290 * If not, return null. 297 * If not, return null.
291 * Only applicable to 64-bit x86. 298 * Only applicable to 64-bit x86.
292 */ 299 */
293static void __kprobes fix_riprel(struct kprobe *p) 300static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
294{ 301{
295#ifdef CONFIG_X86_64
296 struct insn insn; 302 struct insn insn;
297 kernel_insn_init(&insn, p->ainsn.insn); 303 int ret;
304 kprobe_opcode_t buf[MAX_INSN_SIZE];
298 305
306 kernel_insn_init(&insn, src);
307 if (recover) {
308 insn_get_opcode(&insn);
309 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
310 ret = recover_probed_instruction(buf,
311 (unsigned long)src);
312 if (ret)
313 return 0;
314 kernel_insn_init(&insn, buf);
315 }
316 }
317 insn_get_length(&insn);
318 memcpy(dest, insn.kaddr, insn.length);
319
320#ifdef CONFIG_X86_64
299 if (insn_rip_relative(&insn)) { 321 if (insn_rip_relative(&insn)) {
300 s64 newdisp; 322 s64 newdisp;
301 u8 *disp; 323 u8 *disp;
324 kernel_insn_init(&insn, dest);
302 insn_get_displacement(&insn); 325 insn_get_displacement(&insn);
303 /* 326 /*
304 * The copied instruction uses the %rip-relative addressing 327 * The copied instruction uses the %rip-relative addressing
@@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p)
312 * extension of the original signed 32-bit displacement would 335 * extension of the original signed 32-bit displacement would
313 * have given. 336 * have given.
314 */ 337 */
315 newdisp = (u8 *) p->addr + (s64) insn.displacement.value - 338 newdisp = (u8 *) src + (s64) insn.displacement.value -
316 (u8 *) p->ainsn.insn; 339 (u8 *) dest;
317 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ 340 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
318 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); 341 disp = (u8 *) dest + insn_offset_displacement(&insn);
319 *(s32 *) disp = (s32) newdisp; 342 *(s32 *) disp = (s32) newdisp;
320 } 343 }
321#endif 344#endif
345 return insn.length;
322} 346}
323 347
324static void __kprobes arch_copy_kprobe(struct kprobe *p) 348static void __kprobes arch_copy_kprobe(struct kprobe *p)
325{ 349{
326 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 350 /*
327 351 * Copy an instruction without recovering int3, because it will be
328 fix_riprel(p); 352 * put by another subsystem.
353 */
354 __copy_instruction(p->ainsn.insn, p->addr, 0);
329 355
330 if (can_boost(p->addr)) 356 if (can_boost(p->addr))
331 p->ainsn.boostable = 0; 357 p->ainsn.boostable = 0;
@@ -337,6 +363,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
337 363
338int __kprobes arch_prepare_kprobe(struct kprobe *p) 364int __kprobes arch_prepare_kprobe(struct kprobe *p)
339{ 365{
366 if (alternatives_text_reserved(p->addr, p->addr))
367 return -EINVAL;
368
340 if (!can_probe((unsigned long)p->addr)) 369 if (!can_probe((unsigned long)p->addr))
341 return -EILSEQ; 370 return -EILSEQ;
342 /* insn: must be on special executable page on x86. */ 371 /* insn: must be on special executable page on x86. */
@@ -403,18 +432,6 @@ static void __kprobes restore_btf(void)
403 update_debugctlmsr(current->thread.debugctlmsr); 432 update_debugctlmsr(current->thread.debugctlmsr);
404} 433}
405 434
406static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
407{
408 clear_btf();
409 regs->flags |= X86_EFLAGS_TF;
410 regs->flags &= ~X86_EFLAGS_IF;
411 /* single step inline if the instruction is an int3 */
412 if (p->opcode == BREAKPOINT_INSTRUCTION)
413 regs->ip = (unsigned long)p->addr;
414 else
415 regs->ip = (unsigned long)p->ainsn.insn;
416}
417
418void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
419 struct pt_regs *regs) 436 struct pt_regs *regs)
420{ 437{
@@ -426,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
426 *sara = (unsigned long) &kretprobe_trampoline; 443 *sara = (unsigned long) &kretprobe_trampoline;
427} 444}
428 445
446#ifdef CONFIG_OPTPROBES
447static int __kprobes setup_detour_execution(struct kprobe *p,
448 struct pt_regs *regs,
449 int reenter);
450#else
451#define setup_detour_execution(p, regs, reenter) (0)
452#endif
453
429static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, 454static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
430 struct kprobe_ctlblk *kcb) 455 struct kprobe_ctlblk *kcb, int reenter)
431{ 456{
432#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) 457 if (setup_detour_execution(p, regs, reenter))
458 return;
459
460#if !defined(CONFIG_PREEMPT)
433 if (p->ainsn.boostable == 1 && !p->post_handler) { 461 if (p->ainsn.boostable == 1 && !p->post_handler) {
434 /* Boost up -- we can execute copied instructions directly */ 462 /* Boost up -- we can execute copied instructions directly */
435 reset_current_kprobe(); 463 if (!reenter)
464 reset_current_kprobe();
465 /*
466 * Reentering boosted probe doesn't reset current_kprobe,
467 * nor set current_kprobe, because it doesn't use single
468 * stepping.
469 */
436 regs->ip = (unsigned long)p->ainsn.insn; 470 regs->ip = (unsigned long)p->ainsn.insn;
437 preempt_enable_no_resched(); 471 preempt_enable_no_resched();
438 return; 472 return;
439 } 473 }
440#endif 474#endif
441 prepare_singlestep(p, regs); 475 if (reenter) {
442 kcb->kprobe_status = KPROBE_HIT_SS; 476 save_previous_kprobe(kcb);
477 set_current_kprobe(p, regs, kcb);
478 kcb->kprobe_status = KPROBE_REENTER;
479 } else
480 kcb->kprobe_status = KPROBE_HIT_SS;
481 /* Prepare real single stepping */
482 clear_btf();
483 regs->flags |= X86_EFLAGS_TF;
484 regs->flags &= ~X86_EFLAGS_IF;
485 /* single step inline if the instruction is an int3 */
486 if (p->opcode == BREAKPOINT_INSTRUCTION)
487 regs->ip = (unsigned long)p->addr;
488 else
489 regs->ip = (unsigned long)p->ainsn.insn;
443} 490}
444 491
445/* 492/*
@@ -453,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
453 switch (kcb->kprobe_status) { 500 switch (kcb->kprobe_status) {
454 case KPROBE_HIT_SSDONE: 501 case KPROBE_HIT_SSDONE:
455 case KPROBE_HIT_ACTIVE: 502 case KPROBE_HIT_ACTIVE:
456 save_previous_kprobe(kcb);
457 set_current_kprobe(p, regs, kcb);
458 kprobes_inc_nmissed_count(p); 503 kprobes_inc_nmissed_count(p);
459 prepare_singlestep(p, regs); 504 setup_singlestep(p, regs, kcb, 1);
460 kcb->kprobe_status = KPROBE_REENTER;
461 break; 505 break;
462 case KPROBE_HIT_SS: 506 case KPROBE_HIT_SS:
463 /* A probe has been hit in the codepath leading up to, or just 507 /* A probe has been hit in the codepath leading up to, or just
@@ -532,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
532 * more here. 576 * more here.
533 */ 577 */
534 if (!p->pre_handler || !p->pre_handler(p, regs)) 578 if (!p->pre_handler || !p->pre_handler(p, regs))
535 setup_singlestep(p, regs, kcb); 579 setup_singlestep(p, regs, kcb, 0);
536 return 1; 580 return 1;
537 } 581 }
538 } else if (kprobe_running()) { 582 } else if (kprobe_running()) {
539 p = __get_cpu_var(current_kprobe); 583 p = __get_cpu_var(current_kprobe);
540 if (p->break_handler && p->break_handler(p, regs)) { 584 if (p->break_handler && p->break_handler(p, regs)) {
541 setup_singlestep(p, regs, kcb); 585 setup_singlestep(p, regs, kcb, 0);
542 return 1; 586 return 1;
543 } 587 }
544 } /* else: not a kprobe fault; let the kernel handle it */ 588 } /* else: not a kprobe fault; let the kernel handle it */
@@ -547,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
547 return 0; 591 return 0;
548} 592}
549 593
594#ifdef CONFIG_X86_64
595#define SAVE_REGS_STRING \
596 /* Skip cs, ip, orig_ax. */ \
597 " subq $24, %rsp\n" \
598 " pushq %rdi\n" \
599 " pushq %rsi\n" \
600 " pushq %rdx\n" \
601 " pushq %rcx\n" \
602 " pushq %rax\n" \
603 " pushq %r8\n" \
604 " pushq %r9\n" \
605 " pushq %r10\n" \
606 " pushq %r11\n" \
607 " pushq %rbx\n" \
608 " pushq %rbp\n" \
609 " pushq %r12\n" \
610 " pushq %r13\n" \
611 " pushq %r14\n" \
612 " pushq %r15\n"
613#define RESTORE_REGS_STRING \
614 " popq %r15\n" \
615 " popq %r14\n" \
616 " popq %r13\n" \
617 " popq %r12\n" \
618 " popq %rbp\n" \
619 " popq %rbx\n" \
620 " popq %r11\n" \
621 " popq %r10\n" \
622 " popq %r9\n" \
623 " popq %r8\n" \
624 " popq %rax\n" \
625 " popq %rcx\n" \
626 " popq %rdx\n" \
627 " popq %rsi\n" \
628 " popq %rdi\n" \
629 /* Skip orig_ax, ip, cs */ \
630 " addq $24, %rsp\n"
631#else
632#define SAVE_REGS_STRING \
633 /* Skip cs, ip, orig_ax and gs. */ \
634 " subl $16, %esp\n" \
635 " pushl %fs\n" \
636 " pushl %ds\n" \
637 " pushl %es\n" \
638 " pushl %eax\n" \
639 " pushl %ebp\n" \
640 " pushl %edi\n" \
641 " pushl %esi\n" \
642 " pushl %edx\n" \
643 " pushl %ecx\n" \
644 " pushl %ebx\n"
645#define RESTORE_REGS_STRING \
646 " popl %ebx\n" \
647 " popl %ecx\n" \
648 " popl %edx\n" \
649 " popl %esi\n" \
650 " popl %edi\n" \
651 " popl %ebp\n" \
652 " popl %eax\n" \
653 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
654 " addl $24, %esp\n"
655#endif
656
550/* 657/*
551 * When a retprobed function returns, this code saves registers and 658 * When a retprobed function returns, this code saves registers and
552 * calls trampoline_handler() runs, which calls the kretprobe's handler. 659 * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -560,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
560 /* We don't bother saving the ss register */ 667 /* We don't bother saving the ss register */
561 " pushq %rsp\n" 668 " pushq %rsp\n"
562 " pushfq\n" 669 " pushfq\n"
563 /* 670 SAVE_REGS_STRING
564 * Skip cs, ip, orig_ax.
565 * trampoline_handler() will plug in these values
566 */
567 " subq $24, %rsp\n"
568 " pushq %rdi\n"
569 " pushq %rsi\n"
570 " pushq %rdx\n"
571 " pushq %rcx\n"
572 " pushq %rax\n"
573 " pushq %r8\n"
574 " pushq %r9\n"
575 " pushq %r10\n"
576 " pushq %r11\n"
577 " pushq %rbx\n"
578 " pushq %rbp\n"
579 " pushq %r12\n"
580 " pushq %r13\n"
581 " pushq %r14\n"
582 " pushq %r15\n"
583 " movq %rsp, %rdi\n" 671 " movq %rsp, %rdi\n"
584 " call trampoline_handler\n" 672 " call trampoline_handler\n"
585 /* Replace saved sp with true return address. */ 673 /* Replace saved sp with true return address. */
586 " movq %rax, 152(%rsp)\n" 674 " movq %rax, 152(%rsp)\n"
587 " popq %r15\n" 675 RESTORE_REGS_STRING
588 " popq %r14\n"
589 " popq %r13\n"
590 " popq %r12\n"
591 " popq %rbp\n"
592 " popq %rbx\n"
593 " popq %r11\n"
594 " popq %r10\n"
595 " popq %r9\n"
596 " popq %r8\n"
597 " popq %rax\n"
598 " popq %rcx\n"
599 " popq %rdx\n"
600 " popq %rsi\n"
601 " popq %rdi\n"
602 /* Skip orig_ax, ip, cs */
603 " addq $24, %rsp\n"
604 " popfq\n" 676 " popfq\n"
605#else 677#else
606 " pushf\n" 678 " pushf\n"
607 /* 679 SAVE_REGS_STRING
608 * Skip cs, ip, orig_ax and gs.
609 * trampoline_handler() will plug in these values
610 */
611 " subl $16, %esp\n"
612 " pushl %fs\n"
613 " pushl %es\n"
614 " pushl %ds\n"
615 " pushl %eax\n"
616 " pushl %ebp\n"
617 " pushl %edi\n"
618 " pushl %esi\n"
619 " pushl %edx\n"
620 " pushl %ecx\n"
621 " pushl %ebx\n"
622 " movl %esp, %eax\n" 680 " movl %esp, %eax\n"
623 " call trampoline_handler\n" 681 " call trampoline_handler\n"
624 /* Move flags to cs */ 682 /* Move flags to cs */
@@ -626,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
626 " movl %edx, 52(%esp)\n" 684 " movl %edx, 52(%esp)\n"
627 /* Replace saved flags with true return address. */ 685 /* Replace saved flags with true return address. */
628 " movl %eax, 56(%esp)\n" 686 " movl %eax, 56(%esp)\n"
629 " popl %ebx\n" 687 RESTORE_REGS_STRING
630 " popl %ecx\n"
631 " popl %edx\n"
632 " popl %esi\n"
633 " popl %edi\n"
634 " popl %ebp\n"
635 " popl %eax\n"
636 /* Skip ds, es, fs, gs, orig_ax and ip */
637 " addl $24, %esp\n"
638 " popf\n" 688 " popf\n"
639#endif 689#endif
640 " ret\n"); 690 " ret\n");
@@ -802,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p,
802 * These instructions can be executed directly if it 852 * These instructions can be executed directly if it
803 * jumps back to correct address. 853 * jumps back to correct address.
804 */ 854 */
805 set_jmp_op((void *)regs->ip, 855 synthesize_reljump((void *)regs->ip,
806 (void *)orig_ip + (regs->ip - copy_ip)); 856 (void *)orig_ip + (regs->ip - copy_ip));
807 p->ainsn.boostable = 1; 857 p->ainsn.boostable = 1;
808 } else { 858 } else {
809 p->ainsn.boostable = -1; 859 p->ainsn.boostable = -1;
@@ -1030,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1030 return 0; 1080 return 0;
1031} 1081}
1032 1082
1083
1084#ifdef CONFIG_OPTPROBES
1085
1086/* Insert a call instruction at address 'from', which calls address 'to'.*/
1087static void __kprobes synthesize_relcall(void *from, void *to)
1088{
1089 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1090}
1091
1092/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1093static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1094 unsigned long val)
1095{
1096#ifdef CONFIG_X86_64
1097 *addr++ = 0x48;
1098 *addr++ = 0xbf;
1099#else
1100 *addr++ = 0xb8;
1101#endif
1102 *(unsigned long *)addr = val;
1103}
1104
1105void __kprobes kprobes_optinsn_template_holder(void)
1106{
1107 asm volatile (
1108 ".global optprobe_template_entry\n"
1109 "optprobe_template_entry: \n"
1110#ifdef CONFIG_X86_64
1111 /* We don't bother saving the ss register */
1112 " pushq %rsp\n"
1113 " pushfq\n"
1114 SAVE_REGS_STRING
1115 " movq %rsp, %rsi\n"
1116 ".global optprobe_template_val\n"
1117 "optprobe_template_val: \n"
1118 ASM_NOP5
1119 ASM_NOP5
1120 ".global optprobe_template_call\n"
1121 "optprobe_template_call: \n"
1122 ASM_NOP5
1123 /* Move flags to rsp */
1124 " movq 144(%rsp), %rdx\n"
1125 " movq %rdx, 152(%rsp)\n"
1126 RESTORE_REGS_STRING
1127 /* Skip flags entry */
1128 " addq $8, %rsp\n"
1129 " popfq\n"
1130#else /* CONFIG_X86_32 */
1131 " pushf\n"
1132 SAVE_REGS_STRING
1133 " movl %esp, %edx\n"
1134 ".global optprobe_template_val\n"
1135 "optprobe_template_val: \n"
1136 ASM_NOP5
1137 ".global optprobe_template_call\n"
1138 "optprobe_template_call: \n"
1139 ASM_NOP5
1140 RESTORE_REGS_STRING
1141 " addl $4, %esp\n" /* skip cs */
1142 " popf\n"
1143#endif
1144 ".global optprobe_template_end\n"
1145 "optprobe_template_end: \n");
1146}
1147
1148#define TMPL_MOVE_IDX \
1149 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1150#define TMPL_CALL_IDX \
1151 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1152#define TMPL_END_IDX \
1153 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1154
1155#define INT3_SIZE sizeof(kprobe_opcode_t)
1156
1157/* Optimized kprobe call back function: called from optinsn */
1158static void __kprobes optimized_callback(struct optimized_kprobe *op,
1159 struct pt_regs *regs)
1160{
1161 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1162
1163 preempt_disable();
1164 if (kprobe_running()) {
1165 kprobes_inc_nmissed_count(&op->kp);
1166 } else {
1167 /* Save skipped registers */
1168#ifdef CONFIG_X86_64
1169 regs->cs = __KERNEL_CS;
1170#else
1171 regs->cs = __KERNEL_CS | get_kernel_rpl();
1172 regs->gs = 0;
1173#endif
1174 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1175 regs->orig_ax = ~0UL;
1176
1177 __get_cpu_var(current_kprobe) = &op->kp;
1178 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1179 opt_pre_handler(&op->kp, regs);
1180 __get_cpu_var(current_kprobe) = NULL;
1181 }
1182 preempt_enable_no_resched();
1183}
1184
1185static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1186{
1187 int len = 0, ret;
1188
1189 while (len < RELATIVEJUMP_SIZE) {
1190 ret = __copy_instruction(dest + len, src + len, 1);
1191 if (!ret || !can_boost(dest + len))
1192 return -EINVAL;
1193 len += ret;
1194 }
1195 /* Check whether the address range is reserved */
1196 if (ftrace_text_reserved(src, src + len - 1) ||
1197 alternatives_text_reserved(src, src + len - 1))
1198 return -EBUSY;
1199
1200 return len;
1201}
1202
1203/* Check whether insn is indirect jump */
1204static int __kprobes insn_is_indirect_jump(struct insn *insn)
1205{
1206 return ((insn->opcode.bytes[0] == 0xff &&
1207 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1208 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1209}
1210
1211/* Check whether insn jumps into specified address range */
1212static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1213{
1214 unsigned long target = 0;
1215
1216 switch (insn->opcode.bytes[0]) {
1217 case 0xe0: /* loopne */
1218 case 0xe1: /* loope */
1219 case 0xe2: /* loop */
1220 case 0xe3: /* jcxz */
1221 case 0xe9: /* near relative jump */
1222 case 0xeb: /* short relative jump */
1223 break;
1224 case 0x0f:
1225 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1226 break;
1227 return 0;
1228 default:
1229 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1230 break;
1231 return 0;
1232 }
1233 target = (unsigned long)insn->next_byte + insn->immediate.value;
1234
1235 return (start <= target && target <= start + len);
1236}
1237
1238/* Decode whole function to ensure any instructions don't jump into target */
1239static int __kprobes can_optimize(unsigned long paddr)
1240{
1241 int ret;
1242 unsigned long addr, size = 0, offset = 0;
1243 struct insn insn;
1244 kprobe_opcode_t buf[MAX_INSN_SIZE];
1245 /* Dummy buffers for lookup_symbol_attrs */
1246 static char __dummy_buf[KSYM_NAME_LEN];
1247
1248 /* Lookup symbol including addr */
1249 if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
1250 return 0;
1251
1252 /* Check there is enough space for a relative jump. */
1253 if (size - offset < RELATIVEJUMP_SIZE)
1254 return 0;
1255
1256 /* Decode instructions */
1257 addr = paddr - offset;
1258 while (addr < paddr - offset + size) { /* Decode until function end */
1259 if (search_exception_tables(addr))
1260 /*
1261 * Since some fixup code will jumps into this function,
1262 * we can't optimize kprobe in this function.
1263 */
1264 return 0;
1265 kernel_insn_init(&insn, (void *)addr);
1266 insn_get_opcode(&insn);
1267 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1268 ret = recover_probed_instruction(buf, addr);
1269 if (ret)
1270 return 0;
1271 kernel_insn_init(&insn, buf);
1272 }
1273 insn_get_length(&insn);
1274 /* Recover address */
1275 insn.kaddr = (void *)addr;
1276 insn.next_byte = (void *)(addr + insn.length);
1277 /* Check any instructions don't jump into target */
1278 if (insn_is_indirect_jump(&insn) ||
1279 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1280 RELATIVE_ADDR_SIZE))
1281 return 0;
1282 addr += insn.length;
1283 }
1284
1285 return 1;
1286}
1287
1288/* Check optimized_kprobe can actually be optimized. */
1289int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1290{
1291 int i;
1292 struct kprobe *p;
1293
1294 for (i = 1; i < op->optinsn.size; i++) {
1295 p = get_kprobe(op->kp.addr + i);
1296 if (p && !kprobe_disabled(p))
1297 return -EEXIST;
1298 }
1299
1300 return 0;
1301}
1302
1303/* Check the addr is within the optimized instructions. */
1304int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1305 unsigned long addr)
1306{
1307 return ((unsigned long)op->kp.addr <= addr &&
1308 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1309}
1310
1311/* Free optimized instruction slot */
1312static __kprobes
1313void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1314{
1315 if (op->optinsn.insn) {
1316 free_optinsn_slot(op->optinsn.insn, dirty);
1317 op->optinsn.insn = NULL;
1318 op->optinsn.size = 0;
1319 }
1320}
1321
1322void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1323{
1324 __arch_remove_optimized_kprobe(op, 1);
1325}
1326
1327/*
1328 * Copy replacing target instructions
1329 * Target instructions MUST be relocatable (checked inside)
1330 */
1331int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1332{
1333 u8 *buf;
1334 int ret;
1335 long rel;
1336
1337 if (!can_optimize((unsigned long)op->kp.addr))
1338 return -EILSEQ;
1339
1340 op->optinsn.insn = get_optinsn_slot();
1341 if (!op->optinsn.insn)
1342 return -ENOMEM;
1343
1344 /*
1345 * Verify if the address gap is in 2GB range, because this uses
1346 * a relative jump.
1347 */
1348 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1349 if (abs(rel) > 0x7fffffff)
1350 return -ERANGE;
1351
1352 buf = (u8 *)op->optinsn.insn;
1353
1354 /* Copy instructions into the out-of-line buffer */
1355 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1356 if (ret < 0) {
1357 __arch_remove_optimized_kprobe(op, 0);
1358 return ret;
1359 }
1360 op->optinsn.size = ret;
1361
1362 /* Copy arch-dep-instance from template */
1363 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1364
1365 /* Set probe information */
1366 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1367
1368 /* Set probe function call */
1369 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1370
1371 /* Set returning jmp instruction at the tail of out-of-line buffer */
1372 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1373 (u8 *)op->kp.addr + op->optinsn.size);
1374
1375 flush_icache_range((unsigned long) buf,
1376 (unsigned long) buf + TMPL_END_IDX +
1377 op->optinsn.size + RELATIVEJUMP_SIZE);
1378 return 0;
1379}
1380
1381/* Replace a breakpoint (int3) with a relative jump. */
1382int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1383{
1384 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1385 s32 rel = (s32)((long)op->optinsn.insn -
1386 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1387
1388 /* Backup instructions which will be replaced by jump address */
1389 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1390 RELATIVE_ADDR_SIZE);
1391
1392 jmp_code[0] = RELATIVEJUMP_OPCODE;
1393 *(s32 *)(&jmp_code[1]) = rel;
1394
1395 /*
1396 * text_poke_smp doesn't support NMI/MCE code modifying.
1397 * However, since kprobes itself also doesn't support NMI/MCE
1398 * code probing, it's not a problem.
1399 */
1400 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
1401 return 0;
1402}
1403
1404/* Replace a relative jump with a breakpoint (int3). */
1405void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1406{
1407 u8 buf[RELATIVEJUMP_SIZE];
1408
1409 /* Set int3 to first byte for kprobes */
1410 buf[0] = BREAKPOINT_INSTRUCTION;
1411 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1412 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1413}
1414
1415static int __kprobes setup_detour_execution(struct kprobe *p,
1416 struct pt_regs *regs,
1417 int reenter)
1418{
1419 struct optimized_kprobe *op;
1420
1421 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1422 /* This kprobe is really able to run optimized path. */
1423 op = container_of(p, struct optimized_kprobe, kp);
1424 /* Detour through copied instructions */
1425 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1426 if (!reenter)
1427 reset_current_kprobe();
1428 preempt_enable_no_resched();
1429 return 1;
1430 }
1431 return 0;
1432}
1433#endif
1434
1033int __init arch_init_kprobes(void) 1435int __init arch_init_kprobes(void)
1034{ 1436{
1035 return 0; 1437 return 0;
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index ebd193e476ca..85a343e28937 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -328,7 +328,7 @@ static int apply_microcode(int cpu)
328 cpu_num, mc_intel->hdr.rev); 328 cpu_num, mc_intel->hdr.rev);
329 return -1; 329 return -1;
330 } 330 }
331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", 331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
332 cpu_num, val[1], 332 cpu_num, val[1],
333 mc_intel->hdr.date & 0xffff, 333 mc_intel->hdr.date & 0xffff,
334 mc_intel->hdr.date >> 24, 334 mc_intel->hdr.date >> 24,
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15fdc416..71825806cd44 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/dmi.h> 9#include <linux/dmi.h>
10#include <linux/range.h>
11
10#include <asm/pci-direct.h> 12#include <asm/pci-direct.h>
11#include <linux/sort.h> 13#include <linux/sort.h>
12#include <asm/io.h> 14#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
30 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, 32 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
31}; 33};
32 34
33struct range {
34 u64 start;
35 u64 end;
36};
37
38static int __cpuinit cmp_range(const void *x1, const void *x2) 35static int __cpuinit cmp_range(const void *x1, const void *x2)
39{ 36{
40 const struct range *r1 = x1; 37 const struct range *r1 = x1;
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 3b7078abc871..0aad8670858e 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -10,8 +10,211 @@
10 * of the License. 10 * of the License.
11 */ 11 */
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sfi.h>
15#include <linux/irq.h>
16#include <linux/module.h>
13 17
14#include <asm/setup.h> 18#include <asm/setup.h>
19#include <asm/mpspec_def.h>
20#include <asm/hw_irq.h>
21#include <asm/apic.h>
22#include <asm/io_apic.h>
23#include <asm/mrst.h>
24#include <asm/io.h>
25#include <asm/i8259.h>
26#include <asm/apb_timer.h>
27
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
30int sfi_mtimer_num;
31
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
33EXPORT_SYMBOL_GPL(sfi_mrtc_array);
34int sfi_mrtc_num;
35
36static inline void assign_to_mp_irq(struct mpc_intsrc *m,
37 struct mpc_intsrc *mp_irq)
38{
39 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
40}
41
42static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
43 struct mpc_intsrc *m)
44{
45 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
46}
47
48static void save_mp_irq(struct mpc_intsrc *m)
49{
50 int i;
51
52 for (i = 0; i < mp_irq_entries; i++) {
53 if (!mp_irq_cmp(&mp_irqs[i], m))
54 return;
55 }
56
57 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
58 if (++mp_irq_entries == MAX_IRQ_SOURCES)
59 panic("Max # of irq sources exceeded!!\n");
60}
61
62/* parse all the mtimer info to a static mtimer array */
63static int __init sfi_parse_mtmr(struct sfi_table_header *table)
64{
65 struct sfi_table_simple *sb;
66 struct sfi_timer_table_entry *pentry;
67 struct mpc_intsrc mp_irq;
68 int totallen;
69
70 sb = (struct sfi_table_simple *)table;
71 if (!sfi_mtimer_num) {
72 sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
73 struct sfi_timer_table_entry);
74 pentry = (struct sfi_timer_table_entry *) sb->pentry;
75 totallen = sfi_mtimer_num * sizeof(*pentry);
76 memcpy(sfi_mtimer_array, pentry, totallen);
77 }
78
79 printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
80 pentry = sfi_mtimer_array;
81 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
82 printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
83 " irq = %d\n", totallen, (u32)pentry->phys_addr,
84 pentry->freq_hz, pentry->irq);
85 if (!pentry->irq)
86 continue;
87 mp_irq.type = MP_IOAPIC;
88 mp_irq.irqtype = mp_INT;
89/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
90 mp_irq.irqflag = 5;
91 mp_irq.srcbus = 0;
92 mp_irq.srcbusirq = pentry->irq; /* IRQ */
93 mp_irq.dstapic = MP_APIC_ALL;
94 mp_irq.dstirq = pentry->irq;
95 save_mp_irq(&mp_irq);
96 }
97
98 return 0;
99}
100
101struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
102{
103 int i;
104 if (hint < sfi_mtimer_num) {
105 if (!sfi_mtimer_usage[hint]) {
106 pr_debug("hint taken for timer %d irq %d\n",\
107 hint, sfi_mtimer_array[hint].irq);
108 sfi_mtimer_usage[hint] = 1;
109 return &sfi_mtimer_array[hint];
110 }
111 }
112 /* take the first timer available */
113 for (i = 0; i < sfi_mtimer_num;) {
114 if (!sfi_mtimer_usage[i]) {
115 sfi_mtimer_usage[i] = 1;
116 return &sfi_mtimer_array[i];
117 }
118 i++;
119 }
120 return NULL;
121}
122
123void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
124{
125 int i;
126 for (i = 0; i < sfi_mtimer_num;) {
127 if (mtmr->irq == sfi_mtimer_array[i].irq) {
128 sfi_mtimer_usage[i] = 0;
129 return;
130 }
131 i++;
132 }
133}
134
135/* parse all the mrtc info to a global mrtc array */
136int __init sfi_parse_mrtc(struct sfi_table_header *table)
137{
138 struct sfi_table_simple *sb;
139 struct sfi_rtc_table_entry *pentry;
140 struct mpc_intsrc mp_irq;
141
142 int totallen;
143
144 sb = (struct sfi_table_simple *)table;
145 if (!sfi_mrtc_num) {
146 sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
147 struct sfi_rtc_table_entry);
148 pentry = (struct sfi_rtc_table_entry *)sb->pentry;
149 totallen = sfi_mrtc_num * sizeof(*pentry);
150 memcpy(sfi_mrtc_array, pentry, totallen);
151 }
152
153 printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
154 pentry = sfi_mrtc_array;
155 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
156 printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
157 totallen, (u32)pentry->phys_addr, pentry->irq);
158 mp_irq.type = MP_IOAPIC;
159 mp_irq.irqtype = mp_INT;
160 mp_irq.irqflag = 0;
161 mp_irq.srcbus = 0;
162 mp_irq.srcbusirq = pentry->irq; /* IRQ */
163 mp_irq.dstapic = MP_APIC_ALL;
164 mp_irq.dstirq = pentry->irq;
165 save_mp_irq(&mp_irq);
166 }
167 return 0;
168}
169
170/*
171 * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
172 * APBT but cmdline option can also override it.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void)
183{
184 unsigned long flags, fast_calibrate;
185
186 local_irq_save(flags);
187 fast_calibrate = apbt_quick_calibrate();
188 local_irq_restore(flags);
189
190 if (fast_calibrate)
191 return fast_calibrate;
192
193 return 0;
194}
195
196void __init mrst_time_init(void)
197{
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0();
200 apbt_time_init();
201}
202
203void __init mrst_rtc_init(void)
204{
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206}
207
208/*
209 * if we use per cpu apb timer, the bootclock already setup. if we use lapic
210 * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{
214 pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu);
215 if (disable_apbt_percpu)
216 setup_boot_APIC_clock();
217};
15 218
16/* 219/*
17 * Moorestown specific x86_init function overrides and early setup 220 * Moorestown specific x86_init function overrides and early setup
@@ -21,4 +224,17 @@ void __init x86_mrst_early_setup(void)
21{ 224{
22 x86_init.resources.probe_roms = x86_init_noop; 225 x86_init.resources.probe_roms = x86_init_noop;
23 x86_init.resources.reserve_resources = x86_init_noop; 226 x86_init.resources.reserve_resources = x86_init_noop;
227
228 x86_init.timers.timer_init = mrst_time_init;
229 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
230
231 x86_init.irqs.pre_vector_init = x86_init_noop;
232
233 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
234
235 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
236 x86_init.pci.init = pci_mrst_init;
237 x86_init.pci.fixup_irqs = x86_init_noop;
238
239 legacy_pic = &null_legacy_pic;
24} 240}
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 9d1d263f786f..8297160c41b3 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,7 +17,9 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/io.h> 18#include <linux/io.h>
19#include <linux/string.h> 19#include <linux/string.h>
20
20#include <asm/geode.h> 21#include <asm/geode.h>
22#include <asm/setup.h>
21#include <asm/olpc.h> 23#include <asm/olpc.h>
22 24
23#ifdef CONFIG_OPEN_FIRMWARE 25#ifdef CONFIG_OPEN_FIRMWARE
@@ -243,9 +245,11 @@ static int __init olpc_init(void)
243 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, 245 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
244 (unsigned char *) &olpc_platform_info.ecver, 1); 246 (unsigned char *) &olpc_platform_info.ecver, 1);
245 247
246 /* check to see if the VSA exists */ 248#ifdef CONFIG_PCI_OLPC
247 if (cs5535_has_vsa2()) 249 /* If the VSA exists let it emulate PCI, if not emulate in kernel */
248 olpc_platform_info.flags |= OLPC_F_VSA; 250 if (!cs5535_has_vsa2())
251 x86_init.pci.arch_init = pci_olpc_init;
252#endif
249 253
250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", 254 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
251 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", 255 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d16310..1db183ed7c01 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = {
428 .ptep_modify_prot_start = __ptep_modify_prot_start, 428 .ptep_modify_prot_start = __ptep_modify_prot_start,
429 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 429 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
430 430
431#ifdef CONFIG_HIGHPTE
432 .kmap_atomic_pte = kmap_atomic,
433#endif
434
435#if PAGETABLE_LEVELS >= 3 431#if PAGETABLE_LEVELS >= 3
436#ifdef CONFIG_X86_PAE 432#ifdef CONFIG_X86_PAE
437 .set_pte_atomic = native_set_pte_atomic, 433 .set_pte_atomic = native_set_pte_atomic,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 2bbde6078143..fb99f7edb341 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1309/* 1309/*
1310 * get_tce_space_from_tar(): 1310 * get_tce_space_from_tar():
1311 * Function for kdump case. Get the tce tables from first kernel 1311 * Function for kdump case. Get the tce tables from first kernel
1312 * by reading the contents of the base adress register of calgary iommu 1312 * by reading the contents of the base address register of calgary iommu
1313 */ 1313 */
1314static void __init get_tce_space_from_tar(void) 1314static void __init get_tce_space_from_tar(void)
1315{ 1315{
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 75e14e21f61a..a4ac764a6880 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -38,7 +38,7 @@ int iommu_detected __read_mostly = 0;
38 * This variable becomes 1 if iommu=pt is passed on the kernel command line. 38 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
39 * If this variable is 1, IOMMU implementations do no DMA translation for 39 * If this variable is 1, IOMMU implementations do no DMA translation for
40 * devices and allow every device to access to whole physical memory. This is 40 * devices and allow every device to access to whole physical memory. This is
41 * useful if a user want to use an IOMMU only for KVM device assignment to 41 * useful if a user wants to use an IOMMU only for KVM device assignment to
42 * guests and not for driver dma translation. 42 * guests and not for driver dma translation.
43 */ 43 */
44int iommu_pass_through __read_mostly; 44int iommu_pass_through __read_mostly;
@@ -65,7 +65,7 @@ int dma_set_mask(struct device *dev, u64 mask)
65} 65}
66EXPORT_SYMBOL(dma_set_mask); 66EXPORT_SYMBOL(dma_set_mask);
67 67
68#ifdef CONFIG_X86_64 68#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
69static __initdata void *dma32_bootmem_ptr; 69static __initdata void *dma32_bootmem_ptr;
70static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); 70static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
71 71
@@ -116,14 +116,21 @@ static void __init dma32_free_bootmem(void)
116 dma32_bootmem_ptr = NULL; 116 dma32_bootmem_ptr = NULL;
117 dma32_bootmem_size = 0; 117 dma32_bootmem_size = 0;
118} 118}
119#else
120void __init dma32_reserve_bootmem(void)
121{
122}
123static void __init dma32_free_bootmem(void)
124{
125}
126
119#endif 127#endif
120 128
121void __init pci_iommu_alloc(void) 129void __init pci_iommu_alloc(void)
122{ 130{
123#ifdef CONFIG_X86_64
124 /* free the range so iommu could get some range less than 4G */ 131 /* free the range so iommu could get some range less than 4G */
125 dma32_free_bootmem(); 132 dma32_free_bootmem();
126#endif 133
127 if (pci_swiotlb_detect()) 134 if (pci_swiotlb_detect())
128 goto out; 135 goto out;
129 136
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4e8cb4ee9fcb..ad9540676fcc 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -92,6 +92,13 @@ void exit_thread(void)
92 } 92 }
93} 93}
94 94
95void show_regs(struct pt_regs *regs)
96{
97 show_registers(regs);
98 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
99 regs->bp);
100}
101
95void show_regs_common(void) 102void show_regs_common(void)
96{ 103{
97 const char *board, *product; 104 const char *board, *product;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 37ad1e046aae..f6c62667e30c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -174,12 +174,6 @@ void __show_regs(struct pt_regs *regs, int all)
174 d6, d7); 174 d6, d7);
175} 175}
176 176
177void show_regs(struct pt_regs *regs)
178{
179 show_registers(regs);
180 show_trace(NULL, regs, &regs->sp, regs->bp);
181}
182
183void release_thread(struct task_struct *dead_task) 177void release_thread(struct task_struct *dead_task)
184{ 178{
185 BUG_ON(dead_task->mm); 179 BUG_ON(dead_task->mm);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 41a26a82470a..dc9690b4c4cc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -211,12 +211,6 @@ void __show_regs(struct pt_regs *regs, int all)
211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
212} 212}
213 213
214void show_regs(struct pt_regs *regs)
215{
216 show_registers(regs);
217 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
218}
219
220void release_thread(struct task_struct *dead_task) 214void release_thread(struct task_struct *dead_task)
221{ 215{
222 if (dead_task->mm) { 216 if (dead_task->mm) {
@@ -527,6 +521,7 @@ void set_personality_ia32(void)
527 521
528 /* Make sure to be in 32bit mode */ 522 /* Make sure to be in 32bit mode */
529 set_thread_flag(TIF_IA32); 523 set_thread_flag(TIF_IA32);
524 current->personality |= force_personality32;
530 525
531 /* Prepare the first "return" to user space */ 526 /* Prepare the first "return" to user space */
532 current_thread_info()->status |= TS_COMPAT; 527 current_thread_info()->status |= TS_COMPAT;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 017d937639fe..a503b1fd04e5 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -48,6 +48,7 @@ enum x86_regset {
48 REGSET_FP, 48 REGSET_FP,
49 REGSET_XFP, 49 REGSET_XFP,
50 REGSET_IOPERM64 = REGSET_XFP, 50 REGSET_IOPERM64 = REGSET_XFP,
51 REGSET_XSTATE,
51 REGSET_TLS, 52 REGSET_TLS,
52 REGSET_IOPERM32, 53 REGSET_IOPERM32,
53}; 54};
@@ -140,30 +141,6 @@ static const int arg_offs_table[] = {
140#endif 141#endif
141}; 142};
142 143
143/**
144 * regs_get_argument_nth() - get Nth argument at function call
145 * @regs: pt_regs which contains registers at function entry.
146 * @n: argument number.
147 *
148 * regs_get_argument_nth() returns @n th argument of a function call.
149 * Since usually the kernel stack will be changed right after function entry,
150 * you must use this at function entry. If the @n th entry is NOT in the
151 * kernel stack or pt_regs, this returns 0.
152 */
153unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
154{
155 if (n < ARRAY_SIZE(arg_offs_table))
156 return *(unsigned long *)((char *)regs + arg_offs_table[n]);
157 else {
158 /*
159 * The typical case: arg n is on the stack.
160 * (Note: stack[0] = return address, so skip it)
161 */
162 n -= ARRAY_SIZE(arg_offs_table);
163 return regs_get_kernel_stack_nth(regs, 1 + n);
164 }
165}
166
167/* 144/*
168 * does not yet catch signals sent when the child dies. 145 * does not yet catch signals sent when the child dies.
169 * in exit.c or in signal.c. 146 * in exit.c or in signal.c.
@@ -604,7 +581,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
604 struct perf_event_attr attr; 581 struct perf_event_attr attr;
605 582
606 /* 583 /*
607 * We shoud have at least an inactive breakpoint at this 584 * We should have at least an inactive breakpoint at this
608 * slot. It means the user is writing dr7 without having 585 * slot. It means the user is writing dr7 without having
609 * written the address register first 586 * written the address register first
610 */ 587 */
@@ -702,7 +679,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
702 } else if (n == 6) { 679 } else if (n == 6) {
703 val = thread->debugreg6; 680 val = thread->debugreg6;
704 } else if (n == 7) { 681 } else if (n == 7) {
705 val = ptrace_get_dr7(thread->ptrace_bps); 682 val = thread->ptrace_dr7;
706 } 683 }
707 return val; 684 return val;
708} 685}
@@ -778,8 +755,11 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
778 return rc; 755 return rc;
779 } 756 }
780 /* All that's left is DR7 */ 757 /* All that's left is DR7 */
781 if (n == 7) 758 if (n == 7) {
782 rc = ptrace_write_dr7(tsk, val); 759 rc = ptrace_write_dr7(tsk, val);
760 if (!rc)
761 thread->ptrace_dr7 = val;
762 }
783 763
784ret_path: 764ret_path:
785 return rc; 765 return rc;
@@ -1584,7 +1564,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1584 1564
1585#ifdef CONFIG_X86_64 1565#ifdef CONFIG_X86_64
1586 1566
1587static const struct user_regset x86_64_regsets[] = { 1567static struct user_regset x86_64_regsets[] __read_mostly = {
1588 [REGSET_GENERAL] = { 1568 [REGSET_GENERAL] = {
1589 .core_note_type = NT_PRSTATUS, 1569 .core_note_type = NT_PRSTATUS,
1590 .n = sizeof(struct user_regs_struct) / sizeof(long), 1570 .n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1597,6 +1577,12 @@ static const struct user_regset x86_64_regsets[] = {
1597 .size = sizeof(long), .align = sizeof(long), 1577 .size = sizeof(long), .align = sizeof(long),
1598 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1578 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1599 }, 1579 },
1580 [REGSET_XSTATE] = {
1581 .core_note_type = NT_X86_XSTATE,
1582 .size = sizeof(u64), .align = sizeof(u64),
1583 .active = xstateregs_active, .get = xstateregs_get,
1584 .set = xstateregs_set
1585 },
1600 [REGSET_IOPERM64] = { 1586 [REGSET_IOPERM64] = {
1601 .core_note_type = NT_386_IOPERM, 1587 .core_note_type = NT_386_IOPERM,
1602 .n = IO_BITMAP_LONGS, 1588 .n = IO_BITMAP_LONGS,
@@ -1622,7 +1608,7 @@ static const struct user_regset_view user_x86_64_view = {
1622#endif /* CONFIG_X86_64 */ 1608#endif /* CONFIG_X86_64 */
1623 1609
1624#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 1610#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1625static const struct user_regset x86_32_regsets[] = { 1611static struct user_regset x86_32_regsets[] __read_mostly = {
1626 [REGSET_GENERAL] = { 1612 [REGSET_GENERAL] = {
1627 .core_note_type = NT_PRSTATUS, 1613 .core_note_type = NT_PRSTATUS,
1628 .n = sizeof(struct user_regs_struct32) / sizeof(u32), 1614 .n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1641,6 +1627,12 @@ static const struct user_regset x86_32_regsets[] = {
1641 .size = sizeof(u32), .align = sizeof(u32), 1627 .size = sizeof(u32), .align = sizeof(u32),
1642 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1628 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1643 }, 1629 },
1630 [REGSET_XSTATE] = {
1631 .core_note_type = NT_X86_XSTATE,
1632 .size = sizeof(u64), .align = sizeof(u64),
1633 .active = xstateregs_active, .get = xstateregs_get,
1634 .set = xstateregs_set
1635 },
1644 [REGSET_TLS] = { 1636 [REGSET_TLS] = {
1645 .core_note_type = NT_386_TLS, 1637 .core_note_type = NT_386_TLS,
1646 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, 1638 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
@@ -1663,6 +1655,23 @@ static const struct user_regset_view user_x86_32_view = {
1663}; 1655};
1664#endif 1656#endif
1665 1657
1658/*
1659 * This represents bytes 464..511 in the memory layout exported through
1660 * the REGSET_XSTATE interface.
1661 */
1662u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
1663
1664void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
1665{
1666#ifdef CONFIG_X86_64
1667 x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1668#endif
1669#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1670 x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1671#endif
1672 xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
1673}
1674
1666const struct user_regset_view *task_user_regset_view(struct task_struct *task) 1675const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1667{ 1676{
1668#ifdef CONFIG_IA32_EMULATION 1677#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5d9e40c58628..5d7ba1a449bd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -121,7 +121,9 @@
121unsigned long max_low_pfn_mapped; 121unsigned long max_low_pfn_mapped;
122unsigned long max_pfn_mapped; 122unsigned long max_pfn_mapped;
123 123
124#ifdef CONFIG_DMI
124RESERVE_BRK(dmi_alloc, 65536); 125RESERVE_BRK(dmi_alloc, 65536);
126#endif
125 127
126unsigned int boot_cpu_id __read_mostly; 128unsigned int boot_cpu_id __read_mostly;
127 129
@@ -667,6 +669,23 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
667 {} 669 {}
668}; 670};
669 671
672static void __init trim_bios_range(void)
673{
674 /*
675 * A special case is the first 4Kb of memory;
676 * This is a BIOS owned area, not kernel ram, but generally
677 * not listed as such in the E820 table.
678 */
679 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
680 /*
681 * special case: Some BIOSen report the PC BIOS
682 * area (640->1Mb) as ram even though it is not.
683 * take them out.
684 */
685 e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
686 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
687}
688
670/* 689/*
671 * Determine if we were loaded by an EFI loader. If so, then we have also been 690 * Determine if we were loaded by an EFI loader. If so, then we have also been
672 * passed the efi memmap, systab, etc., so we should use these data structures 691 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -830,7 +849,7 @@ void __init setup_arch(char **cmdline_p)
830 insert_resource(&iomem_resource, &data_resource); 849 insert_resource(&iomem_resource, &data_resource);
831 insert_resource(&iomem_resource, &bss_resource); 850 insert_resource(&iomem_resource, &bss_resource);
832 851
833 852 trim_bios_range();
834#ifdef CONFIG_X86_32 853#ifdef CONFIG_X86_32
835 if (ppro_with_ram_bug()) { 854 if (ppro_with_ram_bug()) {
836 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, 855 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
@@ -950,15 +969,11 @@ void __init setup_arch(char **cmdline_p)
950#endif 969#endif
951 970
952 initmem_init(0, max_pfn, acpi, k8); 971 initmem_init(0, max_pfn, acpi, k8);
972#ifndef CONFIG_NO_BOOTMEM
973 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
974#endif
953 975
954#ifdef CONFIG_X86_64
955 /*
956 * dma32_reserve_bootmem() allocates bootmem which may conflict
957 * with the crashkernel command line, so do that after
958 * reserve_crashkernel()
959 */
960 dma32_reserve_bootmem(); 976 dma32_reserve_bootmem();
961#endif
962 977
963 reserve_ibft_region(); 978 reserve_ibft_region();
964 979
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 35abcb8b00e9..ef6370b00e70 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
137 137
138static void __init pcpu_fc_free(void *ptr, size_t size) 138static void __init pcpu_fc_free(void *ptr, size_t size)
139{ 139{
140#ifdef CONFIG_NO_BOOTMEM
141 u64 start = __pa(ptr);
142 u64 end = start + size;
143 free_early_partial(start, end);
144#else
140 free_bootmem(__pa(ptr), size); 145 free_bootmem(__pa(ptr), size);
146#endif
141} 147}
142 148
143static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 149static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b4e870cbdc60..a02e80c3c54b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -48,6 +48,7 @@
48#include <linux/err.h> 48#include <linux/err.h>
49#include <linux/nmi.h> 49#include <linux/nmi.h>
50#include <linux/tboot.h> 50#include <linux/tboot.h>
51#include <linux/stackprotector.h>
51 52
52#include <asm/acpi.h> 53#include <asm/acpi.h>
53#include <asm/desc.h> 54#include <asm/desc.h>
@@ -67,6 +68,7 @@
67#include <linux/mc146818rtc.h> 68#include <linux/mc146818rtc.h>
68 69
69#include <asm/smpboot_hooks.h> 70#include <asm/smpboot_hooks.h>
71#include <asm/i8259.h>
70 72
71#ifdef CONFIG_X86_32 73#ifdef CONFIG_X86_32
72u8 apicid_2_node[MAX_APICID]; 74u8 apicid_2_node[MAX_APICID];
@@ -241,6 +243,11 @@ static void __cpuinit smp_callin(void)
241 map_cpu_to_logical_apicid(); 243 map_cpu_to_logical_apicid();
242 244
243 notify_cpu_starting(cpuid); 245 notify_cpu_starting(cpuid);
246
247 /*
248 * Need to setup vector mappings before we enable interrupts.
249 */
250 __setup_vector_irq(smp_processor_id());
244 /* 251 /*
245 * Get our bogomips. 252 * Get our bogomips.
246 * 253 *
@@ -286,9 +293,9 @@ notrace static void __cpuinit start_secondary(void *unused)
286 check_tsc_sync_target(); 293 check_tsc_sync_target();
287 294
288 if (nmi_watchdog == NMI_IO_APIC) { 295 if (nmi_watchdog == NMI_IO_APIC) {
289 disable_8259A_irq(0); 296 legacy_pic->chip->mask(0);
290 enable_NMI_through_LVT0(); 297 enable_NMI_through_LVT0();
291 enable_8259A_irq(0); 298 legacy_pic->chip->unmask(0);
292 } 299 }
293 300
294#ifdef CONFIG_X86_32 301#ifdef CONFIG_X86_32
@@ -315,15 +322,18 @@ notrace static void __cpuinit start_secondary(void *unused)
315 */ 322 */
316 ipi_call_lock(); 323 ipi_call_lock();
317 lock_vector_lock(); 324 lock_vector_lock();
318 __setup_vector_irq(smp_processor_id());
319 set_cpu_online(smp_processor_id(), true); 325 set_cpu_online(smp_processor_id(), true);
320 unlock_vector_lock(); 326 unlock_vector_lock();
321 ipi_call_unlock(); 327 ipi_call_unlock();
322 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 328 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
329 x86_platform.nmi_init();
323 330
324 /* enable local interrupts */ 331 /* enable local interrupts */
325 local_irq_enable(); 332 local_irq_enable();
326 333
334 /* to prevent fake stack check failure in clock setup */
335 boot_init_stack_canary();
336
327 x86_cpuinit.setup_percpu_clockev(); 337 x86_cpuinit.setup_percpu_clockev();
328 338
329 wmb(); 339 wmb();
@@ -1211,11 +1221,12 @@ __init void prefill_possible_map(void)
1211 1221
1212 total_cpus = max_t(int, possible, num_processors + disabled_cpus); 1222 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1213 1223
1214 if (possible > CONFIG_NR_CPUS) { 1224 /* nr_cpu_ids could be reduced via nr_cpus= */
1225 if (possible > nr_cpu_ids) {
1215 printk(KERN_WARNING 1226 printk(KERN_WARNING
1216 "%d Processors exceeds NR_CPUS limit of %d\n", 1227 "%d Processors exceeds NR_CPUS limit of %d\n",
1217 possible, CONFIG_NR_CPUS); 1228 possible, nr_cpu_ids);
1218 possible = CONFIG_NR_CPUS; 1229 possible = nr_cpu_ids;
1219 } 1230 }
1220 1231
1221 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1232 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index dee1ff7cba58..196552bb412c 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -25,191 +25,6 @@
25#include <asm/syscalls.h> 25#include <asm/syscalls.h>
26 26
27/* 27/*
28 * Perform the select(nd, in, out, ex, tv) and mmap() system
29 * calls. Linux/i386 didn't use to be able to handle more than
30 * 4 system call parameters, so these system calls used a memory
31 * block for parameter passing..
32 */
33
34struct mmap_arg_struct {
35 unsigned long addr;
36 unsigned long len;
37 unsigned long prot;
38 unsigned long flags;
39 unsigned long fd;
40 unsigned long offset;
41};
42
43asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
44{
45 struct mmap_arg_struct a;
46 int err = -EFAULT;
47
48 if (copy_from_user(&a, arg, sizeof(a)))
49 goto out;
50
51 err = -EINVAL;
52 if (a.offset & ~PAGE_MASK)
53 goto out;
54
55 err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
56 a.fd, a.offset >> PAGE_SHIFT);
57out:
58 return err;
59}
60
61
62struct sel_arg_struct {
63 unsigned long n;
64 fd_set __user *inp, *outp, *exp;
65 struct timeval __user *tvp;
66};
67
68asmlinkage int old_select(struct sel_arg_struct __user *arg)
69{
70 struct sel_arg_struct a;
71
72 if (copy_from_user(&a, arg, sizeof(a)))
73 return -EFAULT;
74 /* sys_select() does the appropriate kernel locking */
75 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
76}
77
78/*
79 * sys_ipc() is the de-multiplexer for the SysV IPC calls..
80 *
81 * This is really horribly ugly.
82 */
83asmlinkage int sys_ipc(uint call, int first, int second,
84 int third, void __user *ptr, long fifth)
85{
86 int version, ret;
87
88 version = call >> 16; /* hack for backward compatibility */
89 call &= 0xffff;
90
91 switch (call) {
92 case SEMOP:
93 return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
94 case SEMTIMEDOP:
95 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
96 (const struct timespec __user *)fifth);
97
98 case SEMGET:
99 return sys_semget(first, second, third);
100 case SEMCTL: {
101 union semun fourth;
102 if (!ptr)
103 return -EINVAL;
104 if (get_user(fourth.__pad, (void __user * __user *) ptr))
105 return -EFAULT;
106 return sys_semctl(first, second, third, fourth);
107 }
108
109 case MSGSND:
110 return sys_msgsnd(first, (struct msgbuf __user *) ptr,
111 second, third);
112 case MSGRCV:
113 switch (version) {
114 case 0: {
115 struct ipc_kludge tmp;
116 if (!ptr)
117 return -EINVAL;
118
119 if (copy_from_user(&tmp,
120 (struct ipc_kludge __user *) ptr,
121 sizeof(tmp)))
122 return -EFAULT;
123 return sys_msgrcv(first, tmp.msgp, second,
124 tmp.msgtyp, third);
125 }
126 default:
127 return sys_msgrcv(first,
128 (struct msgbuf __user *) ptr,
129 second, fifth, third);
130 }
131 case MSGGET:
132 return sys_msgget((key_t) first, second);
133 case MSGCTL:
134 return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
135
136 case SHMAT:
137 switch (version) {
138 default: {
139 ulong raddr;
140 ret = do_shmat(first, (char __user *) ptr, second, &raddr);
141 if (ret)
142 return ret;
143 return put_user(raddr, (ulong __user *) third);
144 }
145 case 1: /* iBCS2 emulator entry point */
146 if (!segment_eq(get_fs(), get_ds()))
147 return -EINVAL;
148 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
149 return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
150 }
151 case SHMDT:
152 return sys_shmdt((char __user *)ptr);
153 case SHMGET:
154 return sys_shmget(first, second, third);
155 case SHMCTL:
156 return sys_shmctl(first, second,
157 (struct shmid_ds __user *) ptr);
158 default:
159 return -ENOSYS;
160 }
161}
162
163/*
164 * Old cruft
165 */
166asmlinkage int sys_uname(struct old_utsname __user *name)
167{
168 int err;
169 if (!name)
170 return -EFAULT;
171 down_read(&uts_sem);
172 err = copy_to_user(name, utsname(), sizeof(*name));
173 up_read(&uts_sem);
174 return err? -EFAULT:0;
175}
176
177asmlinkage int sys_olduname(struct oldold_utsname __user *name)
178{
179 int error;
180
181 if (!name)
182 return -EFAULT;
183 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
184 return -EFAULT;
185
186 down_read(&uts_sem);
187
188 error = __copy_to_user(&name->sysname, &utsname()->sysname,
189 __OLD_UTS_LEN);
190 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
191 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
192 __OLD_UTS_LEN);
193 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
194 error |= __copy_to_user(&name->release, &utsname()->release,
195 __OLD_UTS_LEN);
196 error |= __put_user(0, name->release + __OLD_UTS_LEN);
197 error |= __copy_to_user(&name->version, &utsname()->version,
198 __OLD_UTS_LEN);
199 error |= __put_user(0, name->version + __OLD_UTS_LEN);
200 error |= __copy_to_user(&name->machine, &utsname()->machine,
201 __OLD_UTS_LEN);
202 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
203
204 up_read(&uts_sem);
205
206 error = error ? -EFAULT : 0;
207
208 return error;
209}
210
211
212/*
213 * Do a system call from kernel instead of calling sys_execve so we 28 * Do a system call from kernel instead of calling sys_execve so we
214 * end up with proper pt_regs. 29 * end up with proper pt_regs.
215 */ 30 */
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 8aa2057efd12..ff14a5044ce6 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -209,15 +209,3 @@ bottomup:
209 209
210 return addr; 210 return addr;
211} 211}
212
213
214SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
215{
216 int err;
217 down_read(&uts_sem);
218 err = copy_to_user(name, utsname(), sizeof(*name));
219 up_read(&uts_sem);
220 if (personality(current->personality) == PER_LINUX32)
221 err |= copy_to_user(&name->machine, "i686", 5);
222 return err ? -EFAULT : 0;
223}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 15228b5d3eb7..8b3729341216 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -81,7 +81,7 @@ ENTRY(sys_call_table)
81 .long sys_settimeofday 81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */ 82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16 83 .long sys_setgroups16
84 .long old_select 84 .long sys_old_select
85 .long sys_symlink 85 .long sys_symlink
86 .long sys_lstat 86 .long sys_lstat
87 .long sys_readlink /* 85 */ 87 .long sys_readlink /* 85 */
@@ -89,7 +89,7 @@ ENTRY(sys_call_table)
89 .long sys_swapon 89 .long sys_swapon
90 .long sys_reboot 90 .long sys_reboot
91 .long sys_old_readdir 91 .long sys_old_readdir
92 .long old_mmap /* 90 */ 92 .long sys_old_mmap /* 90 */
93 .long sys_munmap 93 .long sys_munmap
94 .long sys_truncate 94 .long sys_truncate
95 .long sys_ftruncate 95 .long sys_ftruncate
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index be2573448ed9..fb5cc5e14cfa 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
70 * manually to deassert NMI lines for the watchdog if run 70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system. 71 * on an 82489DX-based system.
72 */ 72 */
73 spin_lock(&i8259A_lock); 73 raw_spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3); 74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */ 75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL); 76 inb(PIC_MASTER_POLL);
77 spin_unlock(&i8259A_lock); 77 raw_spin_unlock(&i8259A_lock);
78 } 78 }
79 79
80 global_clock_event->event_handler(global_clock_event); 80 global_clock_event->event_handler(global_clock_event);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 33399176512a..1168e4454188 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -534,6 +534,9 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
534 534
535 get_debugreg(dr6, 6); 535 get_debugreg(dr6, 6);
536 536
537 /* Filter out all the reserved bits which are preset to 1 */
538 dr6 &= ~DR6_RESERVED;
539
537 /* Catch kmemcheck conditions first of all! */ 540 /* Catch kmemcheck conditions first of all! */
538 if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) 541 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 542 return;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 597683aa5ba0..9faf91ae1841 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -50,7 +50,7 @@ u64 native_sched_clock(void)
50 * unstable. We do this because unlike Time Of Day, 50 * unstable. We do this because unlike Time Of Day,
51 * the scheduler clock tolerates small errors and it's 51 * the scheduler clock tolerates small errors and it's
52 * very important for it to be as fast as the platform 52 * very important for it to be as fast as the platform
53 * can achive it. ) 53 * can achieve it. )
54 */ 54 */
55 if (unlikely(tsc_disabled)) { 55 if (unlikely(tsc_disabled)) {
56 /* No locking but a rare wrong value is not a big deal: */ 56 /* No locking but a rare wrong value is not a big deal: */
@@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void)
740} 740}
741#endif 741#endif
742 742
743static void resume_tsc(void) 743static void resume_tsc(struct clocksource *cs)
744{ 744{
745 clocksource_tsc.cycle_last = 0; 745 clocksource_tsc.cycle_last = 0;
746} 746}
@@ -806,7 +806,7 @@ static void __init check_system_tsc_reliable(void)
806 unsigned long res_low, res_high; 806 unsigned long res_low, res_high;
807 807
808 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); 808 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
809 /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ 809 /* Geode_LX - the OLPC CPU has a very reliable TSC */
810 if (res_low & RTSC_SUSP) 810 if (res_low & RTSC_SUSP)
811 tsc_clocksource_reliable = 1; 811 tsc_clocksource_reliable = 1;
812#endif 812#endif
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
index 36afb98675a4..309c70fb7759 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void)
54 if (!sgi_uv_kobj) 54 if (!sgi_uv_kobj)
55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); 55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
56 if (!sgi_uv_kobj) { 56 if (!sgi_uv_kobj) {
57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); 57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
58 return -EINVAL; 58 return -EINVAL;
59 } 59 }
60 60
61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); 61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
62 if (ret) { 62 if (ret) {
63 printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); 63 printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
64 return ret; 64 return ret;
65 } 65 }
66 66
67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); 67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
68 if (ret) { 68 if (ret) {
69 printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); 69 printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
70 return ret; 70 return ret;
71 } 71 }
72 72
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 34a279a7471d..e680ea52db9b 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -49,11 +49,6 @@ extern int no_broadcast;
49char visws_board_type = -1; 49char visws_board_type = -1;
50char visws_board_rev = -1; 50char visws_board_rev = -1;
51 51
52int is_visws_box(void)
53{
54 return visws_board_type >= 0;
55}
56
57static void __init visws_time_init(void) 52static void __init visws_time_init(void)
58{ 53{
59 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 54 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
@@ -242,6 +237,8 @@ void __init visws_early_detect(void)
242 x86_init.irqs.pre_vector_init = visws_pre_intr_init; 237 x86_init.irqs.pre_vector_init = visws_pre_intr_init;
243 x86_init.irqs.trap_init = visws_trap_init; 238 x86_init.irqs.trap_init = visws_trap_init;
244 x86_init.timers.timer_init = visws_time_init; 239 x86_init.timers.timer_init = visws_time_init;
240 x86_init.pci.init = pci_visws_init;
241 x86_init.pci.init_irq = x86_init_noop;
245 242
246 /* 243 /*
247 * Install reboot quirks: 244 * Install reboot quirks:
@@ -508,7 +505,7 @@ static struct irq_chip cobalt_irq_type = {
508 */ 505 */
509static unsigned int startup_piix4_master_irq(unsigned int irq) 506static unsigned int startup_piix4_master_irq(unsigned int irq)
510{ 507{
511 init_8259A(0); 508 legacy_pic->init(0);
512 509
513 return startup_cobalt_irq(irq); 510 return startup_cobalt_irq(irq);
514} 511}
@@ -532,9 +529,6 @@ static struct irq_chip piix4_master_irq_type = {
532 529
533static struct irq_chip piix4_virtual_irq_type = { 530static struct irq_chip piix4_virtual_irq_type = {
534 .name = "PIIX4-virtual", 531 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq,
538}; 532};
539 533
540 534
@@ -559,7 +553,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
559 struct irq_desc *desc; 553 struct irq_desc *desc;
560 unsigned long flags; 554 unsigned long flags;
561 555
562 spin_lock_irqsave(&i8259A_lock, flags); 556 raw_spin_lock_irqsave(&i8259A_lock, flags);
563 557
564 /* Find out what's interrupting in the PIIX4 master 8259 */ 558 /* Find out what's interrupting in the PIIX4 master 8259 */
565 outb(0x0c, 0x20); /* OCW3 Poll command */ 559 outb(0x0c, 0x20); /* OCW3 Poll command */
@@ -596,7 +590,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
596 outb(0x60 + realirq, 0x20); 590 outb(0x60 + realirq, 0x20);
597 } 591 }
598 592
599 spin_unlock_irqrestore(&i8259A_lock, flags); 593 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
600 594
601 desc = irq_to_desc(realirq); 595 desc = irq_to_desc(realirq);
602 596
@@ -609,12 +603,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
609 handle_IRQ_event(realirq, desc->action); 603 handle_IRQ_event(realirq, desc->action);
610 604
611 if (!(desc->status & IRQ_DISABLED)) 605 if (!(desc->status & IRQ_DISABLED))
612 enable_8259A_irq(realirq); 606 legacy_pic->chip->unmask(realirq);
613 607
614 return IRQ_HANDLED; 608 return IRQ_HANDLED;
615 609
616out_unlock: 610out_unlock:
617 spin_unlock_irqrestore(&i8259A_lock, flags); 611 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
618 return IRQ_NONE; 612 return IRQ_NONE;
619} 613}
620 614
@@ -628,6 +622,12 @@ static struct irqaction cascade_action = {
628 .name = "cascade", 622 .name = "cascade",
629}; 623};
630 624
625static inline void set_piix4_virtual_irq_type(void)
626{
627 piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
628 piix4_virtual_irq_type.enable = i8259A_chip.unmask;
629 piix4_virtual_irq_type.disable = i8259A_chip.mask;
630}
631 631
632void init_VISWS_APIC_irqs(void) 632void init_VISWS_APIC_irqs(void)
633{ 633{
@@ -653,6 +653,7 @@ void init_VISWS_APIC_irqs(void)
653 desc->chip = &piix4_master_irq_type; 653 desc->chip = &piix4_master_irq_type;
654 } 654 }
655 else if (i < CO_IRQ_APIC0) { 655 else if (i < CO_IRQ_APIC0) {
656 set_piix4_virtual_irq_type();
656 desc->chip = &piix4_virtual_irq_type; 657 desc->chip = &piix4_virtual_irq_type;
657 } 658 }
658 else if (IS_CO_APIC(i)) { 659 else if (IS_CO_APIC(i)) {
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index d430e4c30193..7dd599deca4a 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -33,6 +33,7 @@
33#include <asm/fixmap.h> 33#include <asm/fixmap.h>
34#include <asm/apicdef.h> 34#include <asm/apicdef.h>
35#include <asm/apic.h> 35#include <asm/apic.h>
36#include <asm/pgalloc.h>
36#include <asm/processor.h> 37#include <asm/processor.h>
37#include <asm/timer.h> 38#include <asm/timer.h>
38#include <asm/vmi_time.h> 39#include <asm/vmi_time.h>
@@ -266,30 +267,6 @@ static void vmi_nop(void)
266{ 267{
267} 268}
268 269
269#ifdef CONFIG_HIGHPTE
270static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
271{
272 void *va = kmap_atomic(page, type);
273
274 /*
275 * Internally, the VMI ROM must map virtual addresses to physical
276 * addresses for processing MMU updates. By the time MMU updates
277 * are issued, this information is typically already lost.
278 * Fortunately, the VMI provides a cache of mapping slots for active
279 * page tables.
280 *
281 * We use slot zero for the linear mapping of physical memory, and
282 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
283 *
284 * args: SLOT VA COUNT PFN
285 */
286 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
287 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
288
289 return va;
290}
291#endif
292
293static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) 270static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
294{ 271{
295 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 272 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -640,6 +617,12 @@ static inline int __init activate_vmi(void)
640 u64 reloc; 617 u64 reloc;
641 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; 618 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
642 619
620 /*
621 * Prevent page tables from being allocated in highmem, even if
622 * CONFIG_HIGHPTE is enabled.
623 */
624 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
625
643 if (call_vrom_func(vmi_rom, vmi_init) != 0) { 626 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
644 printk(KERN_ERR "VMI ROM failed to initialize!"); 627 printk(KERN_ERR "VMI ROM failed to initialize!");
645 return 0; 628 return 0;
@@ -778,10 +761,6 @@ static inline int __init activate_vmi(void)
778 761
779 /* Set linear is needed in all cases */ 762 /* Set linear is needed in all cases */
780 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); 763 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
781#ifdef CONFIG_HIGHPTE
782 if (vmi_ops.set_linear_mapping)
783 pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
784#endif
785 764
786 /* 765 /*
787 * These MUST always be patched. Don't support indirect jumps 766 * These MUST always be patched. Don't support indirect jumps
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 74c92bb194df..5e1ff66ecd73 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void)
79 79
80static inline unsigned int vmi_get_timer_vector(void) 80static inline unsigned int vmi_get_timer_vector(void)
81{ 81{
82#ifdef CONFIG_X86_IO_APIC 82 return IRQ0_VECTOR;
83 return FIRST_DEVICE_VECTOR;
84#else
85 return FIRST_EXTERNAL_VECTOR;
86#endif
87} 83}
88 84
89/** vmi clockchip */ 85/** vmi clockchip */
@@ -171,7 +167,7 @@ static int vmi_timer_next_event(unsigned long delta,
171{ 167{
172 /* Unfortunately, set_next_event interface only passes relative 168 /* Unfortunately, set_next_event interface only passes relative
173 * expiry, but we want absolute expiry. It'd be better if were 169 * expiry, but we want absolute expiry. It'd be better if were
174 * were passed an aboslute expiry, since a bunch of time may 170 * were passed an absolute expiry, since a bunch of time may
175 * have been stolen between the time the delta is computed and 171 * have been stolen between the time the delta is computed and
176 * when we set the alarm below. */ 172 * when we set the alarm below. */
177 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); 173 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f92a0da608cb..44879df55696 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -341,7 +341,7 @@ SECTIONS
341 * Per-cpu symbols which need to be offset from __per_cpu_load 341 * Per-cpu symbols which need to be offset from __per_cpu_load
342 * for the boot processor. 342 * for the boot processor.
343 */ 343 */
344#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load 344#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
345INIT_PER_CPU(gdt_page); 345INIT_PER_CPU(gdt_page);
346INIT_PER_CPU(irq_stack_union); 346INIT_PER_CPU(irq_stack_union);
347 347
@@ -352,7 +352,7 @@ INIT_PER_CPU(irq_stack_union);
352 "kernel image bigger than KERNEL_IMAGE_SIZE"); 352 "kernel image bigger than KERNEL_IMAGE_SIZE");
353 353
354#ifdef CONFIG_SMP 354#ifdef CONFIG_SMP
355. = ASSERT((per_cpu__irq_stack_union == 0), 355. = ASSERT((irq_stack_union == 0),
356 "irq_stack_union is not at start of per-cpu area"); 356 "irq_stack_union is not at start of per-cpu area");
357#endif 357#endif
358 358
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 9055e5872ff0..1c0c6ab9c60f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -301,7 +301,8 @@ static int __init vsyscall_init(void)
301 register_sysctl_table(kernel_root_table2); 301 register_sysctl_table(kernel_root_table2);
302#endif 302#endif
303 on_each_cpu(cpu_vsyscall_init, NULL, 1); 303 on_each_cpu(cpu_vsyscall_init, NULL, 1);
304 hotcpu_notifier(cpu_vsyscall_notifier, 0); 304 /* notifier priority > KVM */
305 hotcpu_notifier(cpu_vsyscall_notifier, 30);
305 return 0; 306 return 0;
306} 307}
307 308
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 619f7f88b8cc..693920b22496 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -26,7 +26,8 @@ EXPORT_SYMBOL(__put_user_2);
26EXPORT_SYMBOL(__put_user_4); 26EXPORT_SYMBOL(__put_user_4);
27EXPORT_SYMBOL(__put_user_8); 27EXPORT_SYMBOL(__put_user_8);
28 28
29EXPORT_SYMBOL(copy_user_generic); 29EXPORT_SYMBOL(copy_user_generic_string);
30EXPORT_SYMBOL(copy_user_generic_unrolled);
30EXPORT_SYMBOL(__copy_user_nocache); 31EXPORT_SYMBOL(__copy_user_nocache);
31EXPORT_SYMBOL(_copy_from_user); 32EXPORT_SYMBOL(_copy_from_user);
32EXPORT_SYMBOL(_copy_to_user); 33EXPORT_SYMBOL(_copy_to_user);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ccd179dec36e..61a1e8c7e19f 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -4,9 +4,11 @@
4 * For licencing details see kernel-base/COPYING 4 * For licencing details see kernel-base/COPYING
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/ioport.h>
7 8
8#include <asm/bios_ebda.h> 9#include <asm/bios_ebda.h>
9#include <asm/paravirt.h> 10#include <asm/paravirt.h>
11#include <asm/pci_x86.h>
10#include <asm/mpspec.h> 12#include <asm/mpspec.h>
11#include <asm/setup.h> 13#include <asm/setup.h>
12#include <asm/apic.h> 14#include <asm/apic.h>
@@ -70,16 +72,25 @@ struct x86_init_ops x86_init __initdata = {
70 .iommu = { 72 .iommu = {
71 .iommu_init = iommu_init_noop, 73 .iommu_init = iommu_init_noop,
72 }, 74 },
75
76 .pci = {
77 .init = x86_default_pci_init,
78 .init_irq = x86_default_pci_init_irq,
79 .fixup_irqs = x86_default_pci_fixup_irqs,
80 },
73}; 81};
74 82
75struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 83struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
76 .setup_percpu_clockev = setup_secondary_APIC_clock, 84 .setup_percpu_clockev = setup_secondary_APIC_clock,
77}; 85};
78 86
87static void default_nmi_init(void) { };
88
79struct x86_platform_ops x86_platform = { 89struct x86_platform_ops x86_platform = {
80 .calibrate_tsc = native_calibrate_tsc, 90 .calibrate_tsc = native_calibrate_tsc,
81 .get_wallclock = mach_get_cmos_time, 91 .get_wallclock = mach_get_cmos_time,
82 .set_wallclock = mach_set_rtc_mmss, 92 .set_wallclock = mach_set_rtc_mmss,
83 .iommu_shutdown = iommu_shutdown_noop, 93 .iommu_shutdown = iommu_shutdown_noop,
84 .is_untracked_pat_range = is_ISA_range, 94 .is_untracked_pat_range = is_ISA_range,
95 .nmi_init = default_nmi_init
85}; 96};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c5ee17e8c6d9..782c3a362ec6 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void)
337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
338 xstate_size = ebx; 338 xstate_size = ebx;
339 339
340 update_regset_xstate_info(xstate_size, pcntxt_mask);
340 prepare_fx_sw_frame(); 341 prepare_fx_sw_frame();
341 342
342 setup_xstate_init(); 343 setup_xstate_init();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4cd498332466..970bbd479516 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select USER_RETURN_NOTIFIER 31 select USER_RETURN_NOTIFIER
32 select KVM_MMIO
32 ---help--- 33 ---help---
33 Support hosting fully virtualized guest machines using hardware 34 Support hosting fully virtualized guest machines using hardware
34 virtualization extensions. You will need a fairly recent 35 virtualization extensions. You will need a fairly recent
@@ -65,6 +66,7 @@ config KVM_AMD
65 66
66# OK, it's a little counter-intuitive to do this, but it puts it neatly under 67# OK, it's a little counter-intuitive to do this, but it puts it neatly under
67# the virtualization menu. 68# the virtualization menu.
69source drivers/vhost/Kconfig
68source drivers/lguest/Kconfig 70source drivers/lguest/Kconfig
69source drivers/virtio/Kconfig 71source drivers/virtio/Kconfig
70 72
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7e8faea4651e..4dade6ac0827 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -32,7 +32,7 @@
32#include <linux/module.h> 32#include <linux/module.h>
33#include <asm/kvm_emulate.h> 33#include <asm/kvm_emulate.h>
34 34
35#include "mmu.h" /* for is_long_mode() */ 35#include "x86.h"
36 36
37/* 37/*
38 * Opcode effective-address decode tables. 38 * Opcode effective-address decode tables.
@@ -76,6 +76,8 @@
76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
77#define GroupMask 0xff /* Group number stored in bits 0:7 */ 77#define GroupMask 0xff /* Group number stored in bits 0:7 */
78/* Misc flags */ 78/* Misc flags */
79#define Lock (1<<26) /* lock prefix is allowed for the instruction */
80#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
79#define No64 (1<<28) 81#define No64 (1<<28)
80/* Source 2 operand type */ 82/* Source 2 operand type */
81#define Src2None (0<<29) 83#define Src2None (0<<29)
@@ -88,39 +90,40 @@
88enum { 90enum {
89 Group1_80, Group1_81, Group1_82, Group1_83, 91 Group1_80, Group1_81, Group1_82, Group1_83,
90 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 92 Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
93 Group8, Group9,
91}; 94};
92 95
93static u32 opcode_table[256] = { 96static u32 opcode_table[256] = {
94 /* 0x00 - 0x07 */ 97 /* 0x00 - 0x07 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 98 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 99 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 100 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
98 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 101 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
99 /* 0x08 - 0x0F */ 102 /* 0x08 - 0x0F */
100 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 103 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
101 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 104 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
102 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 105 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
103 ImplicitOps | Stack | No64, 0, 106 ImplicitOps | Stack | No64, 0,
104 /* 0x10 - 0x17 */ 107 /* 0x10 - 0x17 */
105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 108 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 109 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 110 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 111 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
109 /* 0x18 - 0x1F */ 112 /* 0x18 - 0x1F */
110 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 113 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 114 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 115 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 116 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
114 /* 0x20 - 0x27 */ 117 /* 0x20 - 0x27 */
115 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 118 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 119 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
117 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 120 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
118 /* 0x28 - 0x2F */ 121 /* 0x28 - 0x2F */
119 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 122 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
120 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 123 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
121 0, 0, 0, 0, 124 0, 0, 0, 0,
122 /* 0x30 - 0x37 */ 125 /* 0x30 - 0x37 */
123 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 126 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
124 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 127 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
125 0, 0, 0, 0, 128 0, 0, 0, 0,
126 /* 0x38 - 0x3F */ 129 /* 0x38 - 0x3F */
@@ -156,7 +159,7 @@ static u32 opcode_table[256] = {
156 Group | Group1_80, Group | Group1_81, 159 Group | Group1_80, Group | Group1_81,
157 Group | Group1_82, Group | Group1_83, 160 Group | Group1_82, Group | Group1_83,
158 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 161 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
159 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 162 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
160 /* 0x88 - 0x8F */ 163 /* 0x88 - 0x8F */
161 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 164 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
162 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 165 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -210,7 +213,7 @@ static u32 opcode_table[256] = {
210 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 213 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
211 /* 0xF0 - 0xF7 */ 214 /* 0xF0 - 0xF7 */
212 0, 0, 0, 0, 215 0, 0, 0, 0,
213 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, 216 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
214 /* 0xF8 - 0xFF */ 217 /* 0xF8 - 0xFF */
215 ImplicitOps, 0, ImplicitOps, ImplicitOps, 218 ImplicitOps, 0, ImplicitOps, ImplicitOps,
216 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, 219 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
@@ -218,16 +221,20 @@ static u32 opcode_table[256] = {
218 221
219static u32 twobyte_table[256] = { 222static u32 twobyte_table[256] = {
220 /* 0x00 - 0x0F */ 223 /* 0x00 - 0x0F */
221 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 224 0, Group | GroupDual | Group7, 0, 0,
222 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 225 0, ImplicitOps, ImplicitOps | Priv, 0,
226 ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
227 0, ImplicitOps | ModRM, 0, 0,
223 /* 0x10 - 0x1F */ 228 /* 0x10 - 0x1F */
224 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 229 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
225 /* 0x20 - 0x2F */ 230 /* 0x20 - 0x2F */
226 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, 231 ModRM | ImplicitOps | Priv, ModRM | Priv,
232 ModRM | ImplicitOps | Priv, ModRM | Priv,
233 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0,
228 /* 0x30 - 0x3F */ 235 /* 0x30 - 0x3F */
229 ImplicitOps, 0, ImplicitOps, 0, 236 ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
230 ImplicitOps, ImplicitOps, 0, 0, 237 ImplicitOps, ImplicitOps | Priv, 0, 0,
231 0, 0, 0, 0, 0, 0, 0, 0, 238 0, 0, 0, 0, 0, 0, 0, 0,
232 /* 0x40 - 0x47 */ 239 /* 0x40 - 0x47 */
233 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 240 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -257,21 +264,23 @@ static u32 twobyte_table[256] = {
257 DstMem | SrcReg | Src2CL | ModRM, 0, 0, 264 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
258 /* 0xA8 - 0xAF */ 265 /* 0xA8 - 0xAF */
259 ImplicitOps | Stack, ImplicitOps | Stack, 266 ImplicitOps | Stack, ImplicitOps | Stack,
260 0, DstMem | SrcReg | ModRM | BitOp, 267 0, DstMem | SrcReg | ModRM | BitOp | Lock,
261 DstMem | SrcReg | Src2ImmByte | ModRM, 268 DstMem | SrcReg | Src2ImmByte | ModRM,
262 DstMem | SrcReg | Src2CL | ModRM, 269 DstMem | SrcReg | Src2CL | ModRM,
263 ModRM, 0, 270 ModRM, 0,
264 /* 0xB0 - 0xB7 */ 271 /* 0xB0 - 0xB7 */
265 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 272 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
266 DstMem | SrcReg | ModRM | BitOp, 273 0, DstMem | SrcReg | ModRM | BitOp | Lock,
267 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, 274 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
268 DstReg | SrcMem16 | ModRM | Mov, 275 DstReg | SrcMem16 | ModRM | Mov,
269 /* 0xB8 - 0xBF */ 276 /* 0xB8 - 0xBF */
270 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, 277 0, 0,
278 Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
271 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, 279 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
272 DstReg | SrcMem16 | ModRM | Mov, 280 DstReg | SrcMem16 | ModRM | Mov,
273 /* 0xC0 - 0xCF */ 281 /* 0xC0 - 0xCF */
274 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, 282 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
283 0, 0, 0, Group | GroupDual | Group9,
275 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0,
276 /* 0xD0 - 0xDF */ 285 /* 0xD0 - 0xDF */
277 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -283,25 +292,41 @@ static u32 twobyte_table[256] = {
283 292
284static u32 group_table[] = { 293static u32 group_table[] = {
285 [Group1_80*8] = 294 [Group1_80*8] =
286 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 295 ByteOp | DstMem | SrcImm | ModRM | Lock,
287 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 296 ByteOp | DstMem | SrcImm | ModRM | Lock,
288 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 297 ByteOp | DstMem | SrcImm | ModRM | Lock,
289 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 298 ByteOp | DstMem | SrcImm | ModRM | Lock,
299 ByteOp | DstMem | SrcImm | ModRM | Lock,
300 ByteOp | DstMem | SrcImm | ModRM | Lock,
301 ByteOp | DstMem | SrcImm | ModRM | Lock,
302 ByteOp | DstMem | SrcImm | ModRM,
290 [Group1_81*8] = 303 [Group1_81*8] =
291 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 304 DstMem | SrcImm | ModRM | Lock,
292 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 305 DstMem | SrcImm | ModRM | Lock,
293 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 306 DstMem | SrcImm | ModRM | Lock,
294 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 307 DstMem | SrcImm | ModRM | Lock,
308 DstMem | SrcImm | ModRM | Lock,
309 DstMem | SrcImm | ModRM | Lock,
310 DstMem | SrcImm | ModRM | Lock,
311 DstMem | SrcImm | ModRM,
295 [Group1_82*8] = 312 [Group1_82*8] =
296 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 313 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
297 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 314 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
298 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 315 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
299 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 316 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
317 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
318 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
319 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
320 ByteOp | DstMem | SrcImm | ModRM | No64,
300 [Group1_83*8] = 321 [Group1_83*8] =
301 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 322 DstMem | SrcImmByte | ModRM | Lock,
302 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 323 DstMem | SrcImmByte | ModRM | Lock,
303 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 324 DstMem | SrcImmByte | ModRM | Lock,
304 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 325 DstMem | SrcImmByte | ModRM | Lock,
326 DstMem | SrcImmByte | ModRM | Lock,
327 DstMem | SrcImmByte | ModRM | Lock,
328 DstMem | SrcImmByte | ModRM | Lock,
329 DstMem | SrcImmByte | ModRM,
305 [Group1A*8] = 330 [Group1A*8] =
306 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, 331 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
307 [Group3_Byte*8] = 332 [Group3_Byte*8] =
@@ -320,24 +345,39 @@ static u32 group_table[] = {
320 SrcMem | ModRM | Stack, 0, 345 SrcMem | ModRM | Stack, 0,
321 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, 346 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
322 [Group7*8] = 347 [Group7*8] =
323 0, 0, ModRM | SrcMem, ModRM | SrcMem, 348 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
324 SrcNone | ModRM | DstMem | Mov, 0, 349 SrcNone | ModRM | DstMem | Mov, 0,
325 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, 350 SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
351 [Group8*8] =
352 0, 0, 0, 0,
353 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
354 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
355 [Group9*8] =
356 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
326}; 357};
327 358
328static u32 group2_table[] = { 359static u32 group2_table[] = {
329 [Group7*8] = 360 [Group7*8] =
330 SrcNone | ModRM, 0, 0, SrcNone | ModRM, 361 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
331 SrcNone | ModRM | DstMem | Mov, 0, 362 SrcNone | ModRM | DstMem | Mov, 0,
332 SrcMem16 | ModRM | Mov, 0, 363 SrcMem16 | ModRM | Mov, 0,
364 [Group9*8] =
365 0, 0, 0, 0, 0, 0, 0, 0,
333}; 366};
334 367
335/* EFLAGS bit definitions. */ 368/* EFLAGS bit definitions. */
369#define EFLG_ID (1<<21)
370#define EFLG_VIP (1<<20)
371#define EFLG_VIF (1<<19)
372#define EFLG_AC (1<<18)
336#define EFLG_VM (1<<17) 373#define EFLG_VM (1<<17)
337#define EFLG_RF (1<<16) 374#define EFLG_RF (1<<16)
375#define EFLG_IOPL (3<<12)
376#define EFLG_NT (1<<14)
338#define EFLG_OF (1<<11) 377#define EFLG_OF (1<<11)
339#define EFLG_DF (1<<10) 378#define EFLG_DF (1<<10)
340#define EFLG_IF (1<<9) 379#define EFLG_IF (1<<9)
380#define EFLG_TF (1<<8)
341#define EFLG_SF (1<<7) 381#define EFLG_SF (1<<7)
342#define EFLG_ZF (1<<6) 382#define EFLG_ZF (1<<6)
343#define EFLG_AF (1<<4) 383#define EFLG_AF (1<<4)
@@ -606,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
606 646
607 if (linear < fc->start || linear >= fc->end) { 647 if (linear < fc->start || linear >= fc->end) {
608 size = min(15UL, PAGE_SIZE - offset_in_page(linear)); 648 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
609 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); 649 rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
610 if (rc) 650 if (rc)
611 return rc; 651 return rc;
612 fc->start = linear; 652 fc->start = linear;
@@ -661,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
661 op_bytes = 3; 701 op_bytes = 3;
662 *address = 0; 702 *address = 0;
663 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 703 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
664 ctxt->vcpu); 704 ctxt->vcpu, NULL);
665 if (rc) 705 if (rc)
666 return rc; 706 return rc;
667 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 707 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
668 ctxt->vcpu); 708 ctxt->vcpu, NULL);
669 return rc; 709 return rc;
670} 710}
671 711
@@ -889,6 +929,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
889 929
890 switch (mode) { 930 switch (mode) {
891 case X86EMUL_MODE_REAL: 931 case X86EMUL_MODE_REAL:
932 case X86EMUL_MODE_VM86:
892 case X86EMUL_MODE_PROT16: 933 case X86EMUL_MODE_PROT16:
893 def_op_bytes = def_ad_bytes = 2; 934 def_op_bytes = def_ad_bytes = 2;
894 break; 935 break;
@@ -975,7 +1016,7 @@ done_prefixes:
975 } 1016 }
976 1017
977 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 1018 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
978 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; 1019 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
979 return -1; 1020 return -1;
980 } 1021 }
981 1022
@@ -1196,13 +1237,56 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1196 rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1237 rc = ops->read_emulated(register_address(c, ss_base(ctxt),
1197 c->regs[VCPU_REGS_RSP]), 1238 c->regs[VCPU_REGS_RSP]),
1198 dest, len, ctxt->vcpu); 1239 dest, len, ctxt->vcpu);
1199 if (rc != 0) 1240 if (rc != X86EMUL_CONTINUE)
1200 return rc; 1241 return rc;
1201 1242
1202 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); 1243 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
1203 return rc; 1244 return rc;
1204} 1245}
1205 1246
1247static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1248 struct x86_emulate_ops *ops,
1249 void *dest, int len)
1250{
1251 int rc;
1252 unsigned long val, change_mask;
1253 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1254 int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
1255
1256 rc = emulate_pop(ctxt, ops, &val, len);
1257 if (rc != X86EMUL_CONTINUE)
1258 return rc;
1259
1260 change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
1261 | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
1262
1263 switch(ctxt->mode) {
1264 case X86EMUL_MODE_PROT64:
1265 case X86EMUL_MODE_PROT32:
1266 case X86EMUL_MODE_PROT16:
1267 if (cpl == 0)
1268 change_mask |= EFLG_IOPL;
1269 if (cpl <= iopl)
1270 change_mask |= EFLG_IF;
1271 break;
1272 case X86EMUL_MODE_VM86:
1273 if (iopl < 3) {
1274 kvm_inject_gp(ctxt->vcpu, 0);
1275 return X86EMUL_PROPAGATE_FAULT;
1276 }
1277 change_mask |= EFLG_IF;
1278 break;
1279 default: /* real mode */
1280 change_mask |= (EFLG_IOPL | EFLG_IF);
1281 break;
1282 }
1283
1284 *(unsigned long *)dest =
1285 (ctxt->eflags & ~change_mask) | (val & change_mask);
1286
1287 return rc;
1288}
1289
1206static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1290static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1207{ 1291{
1208 struct decode_cache *c = &ctxt->decode; 1292 struct decode_cache *c = &ctxt->decode;
@@ -1225,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1225 if (rc != 0) 1309 if (rc != 0)
1226 return rc; 1310 return rc;
1227 1311
1228 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); 1312 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
1229 return rc; 1313 return rc;
1230} 1314}
1231 1315
@@ -1370,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1370 int rc; 1454 int rc;
1371 1455
1372 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); 1456 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1373 if (rc != 0) 1457 if (rc != X86EMUL_CONTINUE)
1374 return rc; 1458 return rc;
1375 1459
1376 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1460 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
@@ -1385,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1385 (u32) c->regs[VCPU_REGS_RBX]; 1469 (u32) c->regs[VCPU_REGS_RBX];
1386 1470
1387 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); 1471 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1388 if (rc != 0) 1472 if (rc != X86EMUL_CONTINUE)
1389 return rc; 1473 return rc;
1390 ctxt->eflags |= EFLG_ZF; 1474 ctxt->eflags |= EFLG_ZF;
1391 } 1475 }
@@ -1407,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1407 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1491 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1408 if (rc) 1492 if (rc)
1409 return rc; 1493 return rc;
1410 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); 1494 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
1411 return rc; 1495 return rc;
1412} 1496}
1413 1497
@@ -1451,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1451 &c->dst.val, 1535 &c->dst.val,
1452 c->dst.bytes, 1536 c->dst.bytes,
1453 ctxt->vcpu); 1537 ctxt->vcpu);
1454 if (rc != 0) 1538 if (rc != X86EMUL_CONTINUE)
1455 return rc; 1539 return rc;
1456 break; 1540 break;
1457 case OP_NONE: 1541 case OP_NONE:
@@ -1514,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1514 u64 msr_data; 1598 u64 msr_data;
1515 1599
1516 /* syscall is not available in real mode */ 1600 /* syscall is not available in real mode */
1517 if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL 1601 if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
1518 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) 1602 return X86EMUL_UNHANDLEABLE;
1519 return -1;
1520 1603
1521 setup_syscalls_segments(ctxt, &cs, &ss); 1604 setup_syscalls_segments(ctxt, &cs, &ss);
1522 1605
@@ -1553,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1553 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1636 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1554 } 1637 }
1555 1638
1556 return 0; 1639 return X86EMUL_CONTINUE;
1557} 1640}
1558 1641
1559static int 1642static int
@@ -1563,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1563 struct kvm_segment cs, ss; 1646 struct kvm_segment cs, ss;
1564 u64 msr_data; 1647 u64 msr_data;
1565 1648
1566 /* inject #UD if LOCK prefix is used */ 1649 /* inject #GP if in real mode */
1567 if (c->lock_prefix) 1650 if (ctxt->mode == X86EMUL_MODE_REAL) {
1568 return -1;
1569
1570 /* inject #GP if in real mode or paging is disabled */
1571 if (ctxt->mode == X86EMUL_MODE_REAL ||
1572 !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1573 kvm_inject_gp(ctxt->vcpu, 0); 1651 kvm_inject_gp(ctxt->vcpu, 0);
1574 return -1; 1652 return X86EMUL_UNHANDLEABLE;
1575 } 1653 }
1576 1654
1577 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1655 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1578 * Therefore, we inject an #UD. 1656 * Therefore, we inject an #UD.
1579 */ 1657 */
1580 if (ctxt->mode == X86EMUL_MODE_PROT64) 1658 if (ctxt->mode == X86EMUL_MODE_PROT64)
1581 return -1; 1659 return X86EMUL_UNHANDLEABLE;
1582 1660
1583 setup_syscalls_segments(ctxt, &cs, &ss); 1661 setup_syscalls_segments(ctxt, &cs, &ss);
1584 1662
@@ -1587,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1587 case X86EMUL_MODE_PROT32: 1665 case X86EMUL_MODE_PROT32:
1588 if ((msr_data & 0xfffc) == 0x0) { 1666 if ((msr_data & 0xfffc) == 0x0) {
1589 kvm_inject_gp(ctxt->vcpu, 0); 1667 kvm_inject_gp(ctxt->vcpu, 0);
1590 return -1; 1668 return X86EMUL_PROPAGATE_FAULT;
1591 } 1669 }
1592 break; 1670 break;
1593 case X86EMUL_MODE_PROT64: 1671 case X86EMUL_MODE_PROT64:
1594 if (msr_data == 0x0) { 1672 if (msr_data == 0x0) {
1595 kvm_inject_gp(ctxt->vcpu, 0); 1673 kvm_inject_gp(ctxt->vcpu, 0);
1596 return -1; 1674 return X86EMUL_PROPAGATE_FAULT;
1597 } 1675 }
1598 break; 1676 break;
1599 } 1677 }
@@ -1618,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1618 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 1696 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1619 c->regs[VCPU_REGS_RSP] = msr_data; 1697 c->regs[VCPU_REGS_RSP] = msr_data;
1620 1698
1621 return 0; 1699 return X86EMUL_CONTINUE;
1622} 1700}
1623 1701
1624static int 1702static int
@@ -1629,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1629 u64 msr_data; 1707 u64 msr_data;
1630 int usermode; 1708 int usermode;
1631 1709
1632 /* inject #UD if LOCK prefix is used */ 1710 /* inject #GP if in real mode or Virtual 8086 mode */
1633 if (c->lock_prefix) 1711 if (ctxt->mode == X86EMUL_MODE_REAL ||
1634 return -1; 1712 ctxt->mode == X86EMUL_MODE_VM86) {
1635
1636 /* inject #GP if in real mode or paging is disabled */
1637 if (ctxt->mode == X86EMUL_MODE_REAL
1638 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1639 kvm_inject_gp(ctxt->vcpu, 0);
1640 return -1;
1641 }
1642
1643 /* sysexit must be called from CPL 0 */
1644 if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
1645 kvm_inject_gp(ctxt->vcpu, 0); 1713 kvm_inject_gp(ctxt->vcpu, 0);
1646 return -1; 1714 return X86EMUL_UNHANDLEABLE;
1647 } 1715 }
1648 1716
1649 setup_syscalls_segments(ctxt, &cs, &ss); 1717 setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1661,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1661 cs.selector = (u16)(msr_data + 16); 1729 cs.selector = (u16)(msr_data + 16);
1662 if ((msr_data & 0xfffc) == 0x0) { 1730 if ((msr_data & 0xfffc) == 0x0) {
1663 kvm_inject_gp(ctxt->vcpu, 0); 1731 kvm_inject_gp(ctxt->vcpu, 0);
1664 return -1; 1732 return X86EMUL_PROPAGATE_FAULT;
1665 } 1733 }
1666 ss.selector = (u16)(msr_data + 24); 1734 ss.selector = (u16)(msr_data + 24);
1667 break; 1735 break;
@@ -1669,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1669 cs.selector = (u16)(msr_data + 32); 1737 cs.selector = (u16)(msr_data + 32);
1670 if (msr_data == 0x0) { 1738 if (msr_data == 0x0) {
1671 kvm_inject_gp(ctxt->vcpu, 0); 1739 kvm_inject_gp(ctxt->vcpu, 0);
1672 return -1; 1740 return X86EMUL_PROPAGATE_FAULT;
1673 } 1741 }
1674 ss.selector = cs.selector + 8; 1742 ss.selector = cs.selector + 8;
1675 cs.db = 0; 1743 cs.db = 0;
@@ -1685,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1685 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; 1753 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
1686 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; 1754 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
1687 1755
1688 return 0; 1756 return X86EMUL_CONTINUE;
1757}
1758
1759static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
1760{
1761 int iopl;
1762 if (ctxt->mode == X86EMUL_MODE_REAL)
1763 return false;
1764 if (ctxt->mode == X86EMUL_MODE_VM86)
1765 return true;
1766 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1767 return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
1768}
1769
1770static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1771 struct x86_emulate_ops *ops,
1772 u16 port, u16 len)
1773{
1774 struct kvm_segment tr_seg;
1775 int r;
1776 u16 io_bitmap_ptr;
1777 u8 perm, bit_idx = port & 0x7;
1778 unsigned mask = (1 << len) - 1;
1779
1780 kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
1781 if (tr_seg.unusable)
1782 return false;
1783 if (tr_seg.limit < 103)
1784 return false;
1785 r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
1786 NULL);
1787 if (r != X86EMUL_CONTINUE)
1788 return false;
1789 if (io_bitmap_ptr + port/8 > tr_seg.limit)
1790 return false;
1791 r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
1792 ctxt->vcpu, NULL);
1793 if (r != X86EMUL_CONTINUE)
1794 return false;
1795 if ((perm >> bit_idx) & mask)
1796 return false;
1797 return true;
1798}
1799
1800static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
1801 struct x86_emulate_ops *ops,
1802 u16 port, u16 len)
1803{
1804 if (emulator_bad_iopl(ctxt))
1805 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
1806 return false;
1807 return true;
1689} 1808}
1690 1809
1691int 1810int
@@ -1709,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1709 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 1828 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1710 saved_eip = c->eip; 1829 saved_eip = c->eip;
1711 1830
1831 /* LOCK prefix is allowed only with some instructions */
1832 if (c->lock_prefix && !(c->d & Lock)) {
1833 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1834 goto done;
1835 }
1836
1837 /* Privileged instruction can be executed only in CPL=0 */
1838 if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
1839 kvm_inject_gp(ctxt->vcpu, 0);
1840 goto done;
1841 }
1842
1712 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) 1843 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1713 memop = c->modrm_ea; 1844 memop = c->modrm_ea;
1714 1845
@@ -1749,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1749 &c->src.val, 1880 &c->src.val,
1750 c->src.bytes, 1881 c->src.bytes,
1751 ctxt->vcpu); 1882 ctxt->vcpu);
1752 if (rc != 0) 1883 if (rc != X86EMUL_CONTINUE)
1753 goto done; 1884 goto done;
1754 c->src.orig_val = c->src.val; 1885 c->src.orig_val = c->src.val;
1755 } 1886 }
@@ -1768,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1768 c->dst.ptr = (void *)c->dst.ptr + 1899 c->dst.ptr = (void *)c->dst.ptr +
1769 (c->src.val & mask) / 8; 1900 (c->src.val & mask) / 8;
1770 } 1901 }
1771 if (!(c->d & Mov) && 1902 if (!(c->d & Mov)) {
1772 /* optimisation - avoid slow emulated read */ 1903 /* optimisation - avoid slow emulated read */
1773 ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1904 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1774 &c->dst.val, 1905 &c->dst.val,
1775 c->dst.bytes, ctxt->vcpu)) != 0)) 1906 c->dst.bytes,
1776 goto done; 1907 ctxt->vcpu);
1908 if (rc != X86EMUL_CONTINUE)
1909 goto done;
1910 }
1777 } 1911 }
1778 c->dst.orig_val = c->dst.val; 1912 c->dst.orig_val = c->dst.val;
1779 1913
@@ -1876,7 +2010,12 @@ special_insn:
1876 break; 2010 break;
1877 case 0x6c: /* insb */ 2011 case 0x6c: /* insb */
1878 case 0x6d: /* insw/insd */ 2012 case 0x6d: /* insw/insd */
1879 if (kvm_emulate_pio_string(ctxt->vcpu, 2013 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2014 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2015 kvm_inject_gp(ctxt->vcpu, 0);
2016 goto done;
2017 }
2018 if (kvm_emulate_pio_string(ctxt->vcpu,
1880 1, 2019 1,
1881 (c->d & ByteOp) ? 1 : c->op_bytes, 2020 (c->d & ByteOp) ? 1 : c->op_bytes,
1882 c->rep_prefix ? 2021 c->rep_prefix ?
@@ -1892,6 +2031,11 @@ special_insn:
1892 return 0; 2031 return 0;
1893 case 0x6e: /* outsb */ 2032 case 0x6e: /* outsb */
1894 case 0x6f: /* outsw/outsd */ 2033 case 0x6f: /* outsw/outsd */
2034 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2035 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2036 kvm_inject_gp(ctxt->vcpu, 0);
2037 goto done;
2038 }
1895 if (kvm_emulate_pio_string(ctxt->vcpu, 2039 if (kvm_emulate_pio_string(ctxt->vcpu,
1896 0, 2040 0,
1897 (c->d & ByteOp) ? 1 : c->op_bytes, 2041 (c->d & ByteOp) ? 1 : c->op_bytes,
@@ -1978,25 +2122,19 @@ special_insn:
1978 break; 2122 break;
1979 case 0x8e: { /* mov seg, r/m16 */ 2123 case 0x8e: { /* mov seg, r/m16 */
1980 uint16_t sel; 2124 uint16_t sel;
1981 int type_bits;
1982 int err;
1983 2125
1984 sel = c->src.val; 2126 sel = c->src.val;
1985 if (c->modrm_reg == VCPU_SREG_SS)
1986 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
1987 2127
1988 if (c->modrm_reg <= 5) { 2128 if (c->modrm_reg == VCPU_SREG_CS ||
1989 type_bits = (c->modrm_reg == 1) ? 9 : 1; 2129 c->modrm_reg > VCPU_SREG_GS) {
1990 err = kvm_load_segment_descriptor(ctxt->vcpu, sel, 2130 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1991 type_bits, c->modrm_reg); 2131 goto done;
1992 } else {
1993 printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
1994 c->modrm);
1995 goto cannot_emulate;
1996 } 2132 }
1997 2133
1998 if (err < 0) 2134 if (c->modrm_reg == VCPU_SREG_SS)
1999 goto cannot_emulate; 2135 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
2136
2137 rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
2000 2138
2001 c->dst.type = OP_NONE; /* Disable writeback. */ 2139 c->dst.type = OP_NONE; /* Disable writeback. */
2002 break; 2140 break;
@@ -2025,7 +2163,10 @@ special_insn:
2025 c->dst.type = OP_REG; 2163 c->dst.type = OP_REG;
2026 c->dst.ptr = (unsigned long *) &ctxt->eflags; 2164 c->dst.ptr = (unsigned long *) &ctxt->eflags;
2027 c->dst.bytes = c->op_bytes; 2165 c->dst.bytes = c->op_bytes;
2028 goto pop_instruction; 2166 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
2167 if (rc != X86EMUL_CONTINUE)
2168 goto done;
2169 break;
2029 case 0xa0 ... 0xa1: /* mov */ 2170 case 0xa0 ... 0xa1: /* mov */
2030 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 2171 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2031 c->dst.val = c->src.val; 2172 c->dst.val = c->src.val;
@@ -2039,11 +2180,12 @@ special_insn:
2039 c->dst.ptr = (unsigned long *)register_address(c, 2180 c->dst.ptr = (unsigned long *)register_address(c,
2040 es_base(ctxt), 2181 es_base(ctxt),
2041 c->regs[VCPU_REGS_RDI]); 2182 c->regs[VCPU_REGS_RDI]);
2042 if ((rc = ops->read_emulated(register_address(c, 2183 rc = ops->read_emulated(register_address(c,
2043 seg_override_base(ctxt, c), 2184 seg_override_base(ctxt, c),
2044 c->regs[VCPU_REGS_RSI]), 2185 c->regs[VCPU_REGS_RSI]),
2045 &c->dst.val, 2186 &c->dst.val,
2046 c->dst.bytes, ctxt->vcpu)) != 0) 2187 c->dst.bytes, ctxt->vcpu);
2188 if (rc != X86EMUL_CONTINUE)
2047 goto done; 2189 goto done;
2048 register_address_increment(c, &c->regs[VCPU_REGS_RSI], 2190 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2049 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 2191 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2058,10 +2200,11 @@ special_insn:
2058 c->src.ptr = (unsigned long *)register_address(c, 2200 c->src.ptr = (unsigned long *)register_address(c,
2059 seg_override_base(ctxt, c), 2201 seg_override_base(ctxt, c),
2060 c->regs[VCPU_REGS_RSI]); 2202 c->regs[VCPU_REGS_RSI]);
2061 if ((rc = ops->read_emulated((unsigned long)c->src.ptr, 2203 rc = ops->read_emulated((unsigned long)c->src.ptr,
2062 &c->src.val, 2204 &c->src.val,
2063 c->src.bytes, 2205 c->src.bytes,
2064 ctxt->vcpu)) != 0) 2206 ctxt->vcpu);
2207 if (rc != X86EMUL_CONTINUE)
2065 goto done; 2208 goto done;
2066 2209
2067 c->dst.type = OP_NONE; /* Disable writeback. */ 2210 c->dst.type = OP_NONE; /* Disable writeback. */
@@ -2069,10 +2212,11 @@ special_insn:
2069 c->dst.ptr = (unsigned long *)register_address(c, 2212 c->dst.ptr = (unsigned long *)register_address(c,
2070 es_base(ctxt), 2213 es_base(ctxt),
2071 c->regs[VCPU_REGS_RDI]); 2214 c->regs[VCPU_REGS_RDI]);
2072 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 2215 rc = ops->read_emulated((unsigned long)c->dst.ptr,
2073 &c->dst.val, 2216 &c->dst.val,
2074 c->dst.bytes, 2217 c->dst.bytes,
2075 ctxt->vcpu)) != 0) 2218 ctxt->vcpu);
2219 if (rc != X86EMUL_CONTINUE)
2076 goto done; 2220 goto done;
2077 2221
2078 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2222 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
@@ -2102,12 +2246,13 @@ special_insn:
2102 c->dst.type = OP_REG; 2246 c->dst.type = OP_REG;
2103 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2247 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2104 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 2248 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2105 if ((rc = ops->read_emulated(register_address(c, 2249 rc = ops->read_emulated(register_address(c,
2106 seg_override_base(ctxt, c), 2250 seg_override_base(ctxt, c),
2107 c->regs[VCPU_REGS_RSI]), 2251 c->regs[VCPU_REGS_RSI]),
2108 &c->dst.val, 2252 &c->dst.val,
2109 c->dst.bytes, 2253 c->dst.bytes,
2110 ctxt->vcpu)) != 0) 2254 ctxt->vcpu);
2255 if (rc != X86EMUL_CONTINUE)
2111 goto done; 2256 goto done;
2112 register_address_increment(c, &c->regs[VCPU_REGS_RSI], 2257 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2113 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 2258 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2163,11 +2308,9 @@ special_insn:
2163 case 0xe9: /* jmp rel */ 2308 case 0xe9: /* jmp rel */
2164 goto jmp; 2309 goto jmp;
2165 case 0xea: /* jmp far */ 2310 case 0xea: /* jmp far */
2166 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, 2311 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
2167 VCPU_SREG_CS) < 0) { 2312 VCPU_SREG_CS))
2168 DPRINTF("jmp far: Failed to load CS descriptor\n"); 2313 goto done;
2169 goto cannot_emulate;
2170 }
2171 2314
2172 c->eip = c->src.val; 2315 c->eip = c->src.val;
2173 break; 2316 break;
@@ -2185,7 +2328,13 @@ special_insn:
2185 case 0xef: /* out (e/r)ax,dx */ 2328 case 0xef: /* out (e/r)ax,dx */
2186 port = c->regs[VCPU_REGS_RDX]; 2329 port = c->regs[VCPU_REGS_RDX];
2187 io_dir_in = 0; 2330 io_dir_in = 0;
2188 do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, 2331 do_io:
2332 if (!emulator_io_permited(ctxt, ops, port,
2333 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2334 kvm_inject_gp(ctxt->vcpu, 0);
2335 goto done;
2336 }
2337 if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
2189 (c->d & ByteOp) ? 1 : c->op_bytes, 2338 (c->d & ByteOp) ? 1 : c->op_bytes,
2190 port) != 0) { 2339 port) != 0) {
2191 c->eip = saved_eip; 2340 c->eip = saved_eip;
@@ -2210,13 +2359,21 @@ special_insn:
2210 c->dst.type = OP_NONE; /* Disable writeback. */ 2359 c->dst.type = OP_NONE; /* Disable writeback. */
2211 break; 2360 break;
2212 case 0xfa: /* cli */ 2361 case 0xfa: /* cli */
2213 ctxt->eflags &= ~X86_EFLAGS_IF; 2362 if (emulator_bad_iopl(ctxt))
2214 c->dst.type = OP_NONE; /* Disable writeback. */ 2363 kvm_inject_gp(ctxt->vcpu, 0);
2364 else {
2365 ctxt->eflags &= ~X86_EFLAGS_IF;
2366 c->dst.type = OP_NONE; /* Disable writeback. */
2367 }
2215 break; 2368 break;
2216 case 0xfb: /* sti */ 2369 case 0xfb: /* sti */
2217 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); 2370 if (emulator_bad_iopl(ctxt))
2218 ctxt->eflags |= X86_EFLAGS_IF; 2371 kvm_inject_gp(ctxt->vcpu, 0);
2219 c->dst.type = OP_NONE; /* Disable writeback. */ 2372 else {
2373 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
2374 ctxt->eflags |= X86_EFLAGS_IF;
2375 c->dst.type = OP_NONE; /* Disable writeback. */
2376 }
2220 break; 2377 break;
2221 case 0xfc: /* cld */ 2378 case 0xfc: /* cld */
2222 ctxt->eflags &= ~EFLG_DF; 2379 ctxt->eflags &= ~EFLG_DF;
@@ -2319,8 +2476,9 @@ twobyte_insn:
2319 } 2476 }
2320 break; 2477 break;
2321 case 0x05: /* syscall */ 2478 case 0x05: /* syscall */
2322 if (emulate_syscall(ctxt) == -1) 2479 rc = emulate_syscall(ctxt);
2323 goto cannot_emulate; 2480 if (rc != X86EMUL_CONTINUE)
2481 goto done;
2324 else 2482 else
2325 goto writeback; 2483 goto writeback;
2326 break; 2484 break;
@@ -2391,14 +2549,16 @@ twobyte_insn:
2391 c->dst.type = OP_NONE; 2549 c->dst.type = OP_NONE;
2392 break; 2550 break;
2393 case 0x34: /* sysenter */ 2551 case 0x34: /* sysenter */
2394 if (emulate_sysenter(ctxt) == -1) 2552 rc = emulate_sysenter(ctxt);
2395 goto cannot_emulate; 2553 if (rc != X86EMUL_CONTINUE)
2554 goto done;
2396 else 2555 else
2397 goto writeback; 2556 goto writeback;
2398 break; 2557 break;
2399 case 0x35: /* sysexit */ 2558 case 0x35: /* sysexit */
2400 if (emulate_sysexit(ctxt) == -1) 2559 rc = emulate_sysexit(ctxt);
2401 goto cannot_emulate; 2560 if (rc != X86EMUL_CONTINUE)
2561 goto done;
2402 else 2562 else
2403 goto writeback; 2563 goto writeback;
2404 break; 2564 break;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 296aba49472a..294698b6daff 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -242,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
242{ 242{
243 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 243 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
244 irq_ack_notifier); 244 irq_ack_notifier);
245 spin_lock(&ps->inject_lock); 245 raw_spin_lock(&ps->inject_lock);
246 if (atomic_dec_return(&ps->pit_timer.pending) < 0) 246 if (atomic_dec_return(&ps->pit_timer.pending) < 0)
247 atomic_inc(&ps->pit_timer.pending); 247 atomic_inc(&ps->pit_timer.pending);
248 ps->irq_ack = 1; 248 ps->irq_ack = 1;
249 spin_unlock(&ps->inject_lock); 249 raw_spin_unlock(&ps->inject_lock);
250} 250}
251 251
252void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 252void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -467,6 +467,9 @@ static int pit_ioport_read(struct kvm_io_device *this,
467 return -EOPNOTSUPP; 467 return -EOPNOTSUPP;
468 468
469 addr &= KVM_PIT_CHANNEL_MASK; 469 addr &= KVM_PIT_CHANNEL_MASK;
470 if (addr == 3)
471 return 0;
472
470 s = &pit_state->channels[addr]; 473 s = &pit_state->channels[addr];
471 474
472 mutex_lock(&pit_state->lock); 475 mutex_lock(&pit_state->lock);
@@ -602,7 +605,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
602 .write = speaker_ioport_write, 605 .write = speaker_ioport_write,
603}; 606};
604 607
605/* Caller must have writers lock on slots_lock */ 608/* Caller must hold slots_lock */
606struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) 609struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
607{ 610{
608 struct kvm_pit *pit; 611 struct kvm_pit *pit;
@@ -621,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
621 624
622 mutex_init(&pit->pit_state.lock); 625 mutex_init(&pit->pit_state.lock);
623 mutex_lock(&pit->pit_state.lock); 626 mutex_lock(&pit->pit_state.lock);
624 spin_lock_init(&pit->pit_state.inject_lock); 627 raw_spin_lock_init(&pit->pit_state.inject_lock);
625 628
626 kvm->arch.vpit = pit; 629 kvm->arch.vpit = pit;
627 pit->kvm = kvm; 630 pit->kvm = kvm;
@@ -642,13 +645,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
642 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 645 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
643 646
644 kvm_iodevice_init(&pit->dev, &pit_dev_ops); 647 kvm_iodevice_init(&pit->dev, &pit_dev_ops);
645 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); 648 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
646 if (ret < 0) 649 if (ret < 0)
647 goto fail; 650 goto fail;
648 651
649 if (flags & KVM_PIT_SPEAKER_DUMMY) { 652 if (flags & KVM_PIT_SPEAKER_DUMMY) {
650 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); 653 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
651 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, 654 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
652 &pit->speaker_dev); 655 &pit->speaker_dev);
653 if (ret < 0) 656 if (ret < 0)
654 goto fail_unregister; 657 goto fail_unregister;
@@ -657,11 +660,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
657 return pit; 660 return pit;
658 661
659fail_unregister: 662fail_unregister:
660 __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); 663 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
661 664
662fail: 665fail:
663 if (pit->irq_source_id >= 0) 666 kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
664 kvm_free_irq_source_id(kvm, pit->irq_source_id); 667 kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
668 kvm_free_irq_source_id(kvm, pit->irq_source_id);
665 669
666 kfree(pit); 670 kfree(pit);
667 return NULL; 671 return NULL;
@@ -720,12 +724,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
720 /* Try to inject pending interrupts when 724 /* Try to inject pending interrupts when
721 * last one has been acked. 725 * last one has been acked.
722 */ 726 */
723 spin_lock(&ps->inject_lock); 727 raw_spin_lock(&ps->inject_lock);
724 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { 728 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
725 ps->irq_ack = 0; 729 ps->irq_ack = 0;
726 inject = 1; 730 inject = 1;
727 } 731 }
728 spin_unlock(&ps->inject_lock); 732 raw_spin_unlock(&ps->inject_lock);
729 if (inject) 733 if (inject)
730 __inject_pit_timer_intr(kvm); 734 __inject_pit_timer_intr(kvm);
731 } 735 }
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index d4c1c7ffdc09..900d6b0ba7c2 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
27 u32 speaker_data_on; 27 u32 speaker_data_on;
28 struct mutex lock; 28 struct mutex lock;
29 struct kvm_pit *pit; 29 struct kvm_pit *pit;
30 spinlock_t inject_lock; 30 raw_spinlock_t inject_lock;
31 unsigned long irq_ack; 31 unsigned long irq_ack;
32 struct kvm_irq_ack_notifier irq_ack_notifier; 32 struct kvm_irq_ack_notifier irq_ack_notifier;
33}; 33};
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index d057c0cbd245..07771da85de5 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -44,18 +44,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
44 * Other interrupt may be delivered to PIC while lock is dropped but 44 * Other interrupt may be delivered to PIC while lock is dropped but
45 * it should be safe since PIC state is already updated at this stage. 45 * it should be safe since PIC state is already updated at this stage.
46 */ 46 */
47 spin_unlock(&s->pics_state->lock); 47 raw_spin_unlock(&s->pics_state->lock);
48 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 48 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
49 spin_lock(&s->pics_state->lock); 49 raw_spin_lock(&s->pics_state->lock);
50} 50}
51 51
52void kvm_pic_clear_isr_ack(struct kvm *kvm) 52void kvm_pic_clear_isr_ack(struct kvm *kvm)
53{ 53{
54 struct kvm_pic *s = pic_irqchip(kvm); 54 struct kvm_pic *s = pic_irqchip(kvm);
55 spin_lock(&s->lock); 55
56 raw_spin_lock(&s->lock);
56 s->pics[0].isr_ack = 0xff; 57 s->pics[0].isr_ack = 0xff;
57 s->pics[1].isr_ack = 0xff; 58 s->pics[1].isr_ack = 0xff;
58 spin_unlock(&s->lock); 59 raw_spin_unlock(&s->lock);
59} 60}
60 61
61/* 62/*
@@ -156,9 +157,9 @@ static void pic_update_irq(struct kvm_pic *s)
156 157
157void kvm_pic_update_irq(struct kvm_pic *s) 158void kvm_pic_update_irq(struct kvm_pic *s)
158{ 159{
159 spin_lock(&s->lock); 160 raw_spin_lock(&s->lock);
160 pic_update_irq(s); 161 pic_update_irq(s);
161 spin_unlock(&s->lock); 162 raw_spin_unlock(&s->lock);
162} 163}
163 164
164int kvm_pic_set_irq(void *opaque, int irq, int level) 165int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -166,14 +167,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
166 struct kvm_pic *s = opaque; 167 struct kvm_pic *s = opaque;
167 int ret = -1; 168 int ret = -1;
168 169
169 spin_lock(&s->lock); 170 raw_spin_lock(&s->lock);
170 if (irq >= 0 && irq < PIC_NUM_PINS) { 171 if (irq >= 0 && irq < PIC_NUM_PINS) {
171 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 172 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
172 pic_update_irq(s); 173 pic_update_irq(s);
173 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 174 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
174 s->pics[irq >> 3].imr, ret == 0); 175 s->pics[irq >> 3].imr, ret == 0);
175 } 176 }
176 spin_unlock(&s->lock); 177 raw_spin_unlock(&s->lock);
177 178
178 return ret; 179 return ret;
179} 180}
@@ -203,7 +204,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
203 int irq, irq2, intno; 204 int irq, irq2, intno;
204 struct kvm_pic *s = pic_irqchip(kvm); 205 struct kvm_pic *s = pic_irqchip(kvm);
205 206
206 spin_lock(&s->lock); 207 raw_spin_lock(&s->lock);
207 irq = pic_get_irq(&s->pics[0]); 208 irq = pic_get_irq(&s->pics[0]);
208 if (irq >= 0) { 209 if (irq >= 0) {
209 pic_intack(&s->pics[0], irq); 210 pic_intack(&s->pics[0], irq);
@@ -228,7 +229,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
228 intno = s->pics[0].irq_base + irq; 229 intno = s->pics[0].irq_base + irq;
229 } 230 }
230 pic_update_irq(s); 231 pic_update_irq(s);
231 spin_unlock(&s->lock); 232 raw_spin_unlock(&s->lock);
232 233
233 return intno; 234 return intno;
234} 235}
@@ -442,7 +443,7 @@ static int picdev_write(struct kvm_io_device *this,
442 printk(KERN_ERR "PIC: non byte write\n"); 443 printk(KERN_ERR "PIC: non byte write\n");
443 return 0; 444 return 0;
444 } 445 }
445 spin_lock(&s->lock); 446 raw_spin_lock(&s->lock);
446 switch (addr) { 447 switch (addr) {
447 case 0x20: 448 case 0x20:
448 case 0x21: 449 case 0x21:
@@ -455,7 +456,7 @@ static int picdev_write(struct kvm_io_device *this,
455 elcr_ioport_write(&s->pics[addr & 1], addr, data); 456 elcr_ioport_write(&s->pics[addr & 1], addr, data);
456 break; 457 break;
457 } 458 }
458 spin_unlock(&s->lock); 459 raw_spin_unlock(&s->lock);
459 return 0; 460 return 0;
460} 461}
461 462
@@ -472,7 +473,7 @@ static int picdev_read(struct kvm_io_device *this,
472 printk(KERN_ERR "PIC: non byte read\n"); 473 printk(KERN_ERR "PIC: non byte read\n");
473 return 0; 474 return 0;
474 } 475 }
475 spin_lock(&s->lock); 476 raw_spin_lock(&s->lock);
476 switch (addr) { 477 switch (addr) {
477 case 0x20: 478 case 0x20:
478 case 0x21: 479 case 0x21:
@@ -486,7 +487,7 @@ static int picdev_read(struct kvm_io_device *this,
486 break; 487 break;
487 } 488 }
488 *(unsigned char *)val = data; 489 *(unsigned char *)val = data;
489 spin_unlock(&s->lock); 490 raw_spin_unlock(&s->lock);
490 return 0; 491 return 0;
491} 492}
492 493
@@ -520,7 +521,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
520 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 521 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
521 if (!s) 522 if (!s)
522 return NULL; 523 return NULL;
523 spin_lock_init(&s->lock); 524 raw_spin_lock_init(&s->lock);
524 s->kvm = kvm; 525 s->kvm = kvm;
525 s->pics[0].elcr_mask = 0xf8; 526 s->pics[0].elcr_mask = 0xf8;
526 s->pics[1].elcr_mask = 0xde; 527 s->pics[1].elcr_mask = 0xde;
@@ -533,7 +534,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
533 * Initialize PIO device 534 * Initialize PIO device
534 */ 535 */
535 kvm_iodevice_init(&s->dev, &picdev_ops); 536 kvm_iodevice_init(&s->dev, &picdev_ops);
536 ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); 537 mutex_lock(&kvm->slots_lock);
538 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
539 mutex_unlock(&kvm->slots_lock);
537 if (ret < 0) { 540 if (ret < 0) {
538 kfree(s); 541 kfree(s);
539 return NULL; 542 return NULL;
@@ -541,3 +544,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
541 544
542 return s; 545 return s;
543} 546}
547
548void kvm_destroy_pic(struct kvm *kvm)
549{
550 struct kvm_pic *vpic = kvm->arch.vpic;
551
552 if (vpic) {
553 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
554 kvm->arch.vpic = NULL;
555 kfree(vpic);
556 }
557}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index be399e207d57..34b15915754d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -62,7 +62,7 @@ struct kvm_kpic_state {
62}; 62};
63 63
64struct kvm_pic { 64struct kvm_pic {
65 spinlock_t lock; 65 raw_spinlock_t lock;
66 unsigned pending_acks; 66 unsigned pending_acks;
67 struct kvm *kvm; 67 struct kvm *kvm;
68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
@@ -75,6 +75,7 @@ struct kvm_pic {
75}; 75};
76 76
77struct kvm_pic *kvm_create_pic(struct kvm *kvm); 77struct kvm_pic *kvm_create_pic(struct kvm *kvm);
78void kvm_destroy_pic(struct kvm *kvm);
78int kvm_pic_read_irq(struct kvm *kvm); 79int kvm_pic_read_irq(struct kvm *kvm);
79void kvm_pic_update_irq(struct kvm_pic *s); 80void kvm_pic_update_irq(struct kvm_pic *s);
80void kvm_pic_clear_isr_ack(struct kvm *kvm); 81void kvm_pic_clear_isr_ack(struct kvm *kvm);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 7bcc5b6a4403..cff851cf5322 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -1,6 +1,11 @@
1#ifndef ASM_KVM_CACHE_REGS_H 1#ifndef ASM_KVM_CACHE_REGS_H
2#define ASM_KVM_CACHE_REGS_H 2#define ASM_KVM_CACHE_REGS_H
3 3
4#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
5#define KVM_POSSIBLE_CR4_GUEST_BITS \
6 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
7 | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
8
4static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, 9static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
5 enum kvm_reg reg) 10 enum kvm_reg reg)
6{ 11{
@@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
38 return vcpu->arch.pdptrs[index]; 43 return vcpu->arch.pdptrs[index];
39} 44}
40 45
46static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
47{
48 ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
49 if (tmask & vcpu->arch.cr0_guest_owned_bits)
50 kvm_x86_ops->decache_cr0_guest_bits(vcpu);
51 return vcpu->arch.cr0 & mask;
52}
53
54static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
55{
56 return kvm_read_cr0_bits(vcpu, ~0UL);
57}
58
59static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
60{
61 ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
62 if (tmask & vcpu->arch.cr4_guest_owned_bits)
63 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
64 return vcpu->arch.cr4 & mask;
65}
66
67static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
68{
69 return kvm_read_cr4_bits(vcpu, ~0UL);
70}
71
41#endif 72#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ba8c045da782..4b224f90087b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1246,3 +1246,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
1246 1246
1247 return 0; 1247 return 0;
1248} 1248}
1249
1250int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
1251{
1252 struct kvm_lapic *apic = vcpu->arch.apic;
1253
1254 if (!irqchip_in_kernel(vcpu->kvm))
1255 return 1;
1256
1257 /* if this is ICR write vector before command */
1258 if (reg == APIC_ICR)
1259 apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
1260 return apic_reg_write(apic, reg, (u32)data);
1261}
1262
1263int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1264{
1265 struct kvm_lapic *apic = vcpu->arch.apic;
1266 u32 low, high = 0;
1267
1268 if (!irqchip_in_kernel(vcpu->kvm))
1269 return 1;
1270
1271 if (apic_reg_read(apic, reg, 4, &low))
1272 return 1;
1273 if (reg == APIC_ICR)
1274 apic_reg_read(apic, APIC_ICR2, 4, &high);
1275
1276 *data = (((u64)high) << 32) | low;
1277
1278 return 0;
1279}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 40010b09c4aa..f5fe32c5edad 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
48 48
49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); 49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); 50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
51
52int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
53int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
54
55static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
56{
57 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
58}
51#endif 59#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 89a49fb46a27..741373e8ca77 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include "mmu.h" 20#include "mmu.h"
21#include "x86.h"
21#include "kvm_cache_regs.h" 22#include "kvm_cache_regs.h"
22 23
23#include <linux/kvm_host.h> 24#include <linux/kvm_host.h>
@@ -29,6 +30,7 @@
29#include <linux/swap.h> 30#include <linux/swap.h>
30#include <linux/hugetlb.h> 31#include <linux/hugetlb.h>
31#include <linux/compiler.h> 32#include <linux/compiler.h>
33#include <linux/srcu.h>
32 34
33#include <asm/page.h> 35#include <asm/page.h>
34#include <asm/cmpxchg.h> 36#include <asm/cmpxchg.h>
@@ -136,16 +138,6 @@ module_param(oos_shadow, bool, 0644);
136#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 138#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
137 | PT64_NX_MASK) 139 | PT64_NX_MASK)
138 140
139#define PFERR_PRESENT_MASK (1U << 0)
140#define PFERR_WRITE_MASK (1U << 1)
141#define PFERR_USER_MASK (1U << 2)
142#define PFERR_RSVD_MASK (1U << 3)
143#define PFERR_FETCH_MASK (1U << 4)
144
145#define PT_PDPE_LEVEL 3
146#define PT_DIRECTORY_LEVEL 2
147#define PT_PAGE_TABLE_LEVEL 1
148
149#define RMAP_EXT 4 141#define RMAP_EXT 4
150 142
151#define ACC_EXEC_MASK 1 143#define ACC_EXEC_MASK 1
@@ -153,6 +145,9 @@ module_param(oos_shadow, bool, 0644);
153#define ACC_USER_MASK PT_USER_MASK 145#define ACC_USER_MASK PT_USER_MASK
154#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 146#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
155 147
148#include <trace/events/kvm.h>
149
150#undef TRACE_INCLUDE_FILE
156#define CREATE_TRACE_POINTS 151#define CREATE_TRACE_POINTS
157#include "mmutrace.h" 152#include "mmutrace.h"
158 153
@@ -229,7 +224,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
229 224
230static int is_write_protection(struct kvm_vcpu *vcpu) 225static int is_write_protection(struct kvm_vcpu *vcpu)
231{ 226{
232 return vcpu->arch.cr0 & X86_CR0_WP; 227 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
233} 228}
234 229
235static int is_cpuid_PSE36(void) 230static int is_cpuid_PSE36(void)
@@ -239,7 +234,7 @@ static int is_cpuid_PSE36(void)
239 234
240static int is_nx(struct kvm_vcpu *vcpu) 235static int is_nx(struct kvm_vcpu *vcpu)
241{ 236{
242 return vcpu->arch.shadow_efer & EFER_NX; 237 return vcpu->arch.efer & EFER_NX;
243} 238}
244 239
245static int is_shadow_present_pte(u64 pte) 240static int is_shadow_present_pte(u64 pte)
@@ -253,7 +248,7 @@ static int is_large_pte(u64 pte)
253 return pte & PT_PAGE_SIZE_MASK; 248 return pte & PT_PAGE_SIZE_MASK;
254} 249}
255 250
256static int is_writeble_pte(unsigned long pte) 251static int is_writable_pte(unsigned long pte)
257{ 252{
258 return pte & PT_WRITABLE_MASK; 253 return pte & PT_WRITABLE_MASK;
259} 254}
@@ -470,24 +465,10 @@ static int has_wrprotected_page(struct kvm *kvm,
470 465
471static int host_mapping_level(struct kvm *kvm, gfn_t gfn) 466static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
472{ 467{
473 unsigned long page_size = PAGE_SIZE; 468 unsigned long page_size;
474 struct vm_area_struct *vma;
475 unsigned long addr;
476 int i, ret = 0; 469 int i, ret = 0;
477 470
478 addr = gfn_to_hva(kvm, gfn); 471 page_size = kvm_host_page_size(kvm, gfn);
479 if (kvm_is_error_hva(addr))
480 return PT_PAGE_TABLE_LEVEL;
481
482 down_read(&current->mm->mmap_sem);
483 vma = find_vma(current->mm, addr);
484 if (!vma)
485 goto out;
486
487 page_size = vma_kernel_pagesize(vma);
488
489out:
490 up_read(&current->mm->mmap_sem);
491 472
492 for (i = PT_PAGE_TABLE_LEVEL; 473 for (i = PT_PAGE_TABLE_LEVEL;
493 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { 474 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
@@ -503,8 +484,7 @@ out:
503static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 484static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
504{ 485{
505 struct kvm_memory_slot *slot; 486 struct kvm_memory_slot *slot;
506 int host_level; 487 int host_level, level, max_level;
507 int level = PT_PAGE_TABLE_LEVEL;
508 488
509 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 489 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
510 if (slot && slot->dirty_bitmap) 490 if (slot && slot->dirty_bitmap)
@@ -515,7 +495,10 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
515 if (host_level == PT_PAGE_TABLE_LEVEL) 495 if (host_level == PT_PAGE_TABLE_LEVEL)
516 return host_level; 496 return host_level;
517 497
518 for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) 498 max_level = kvm_x86_ops->get_lpage_level() < host_level ?
499 kvm_x86_ops->get_lpage_level() : host_level;
500
501 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
519 if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) 502 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
520 break; 503 break;
521 504
@@ -633,7 +616,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
633 pfn = spte_to_pfn(*spte); 616 pfn = spte_to_pfn(*spte);
634 if (*spte & shadow_accessed_mask) 617 if (*spte & shadow_accessed_mask)
635 kvm_set_pfn_accessed(pfn); 618 kvm_set_pfn_accessed(pfn);
636 if (is_writeble_pte(*spte)) 619 if (is_writable_pte(*spte))
637 kvm_set_pfn_dirty(pfn); 620 kvm_set_pfn_dirty(pfn);
638 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); 621 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
639 if (!*rmapp) { 622 if (!*rmapp) {
@@ -662,6 +645,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
662 prev_desc = desc; 645 prev_desc = desc;
663 desc = desc->more; 646 desc = desc->more;
664 } 647 }
648 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
665 BUG(); 649 BUG();
666 } 650 }
667} 651}
@@ -708,7 +692,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
708 BUG_ON(!spte); 692 BUG_ON(!spte);
709 BUG_ON(!(*spte & PT_PRESENT_MASK)); 693 BUG_ON(!(*spte & PT_PRESENT_MASK));
710 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 694 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
711 if (is_writeble_pte(*spte)) { 695 if (is_writable_pte(*spte)) {
712 __set_spte(spte, *spte & ~PT_WRITABLE_MASK); 696 __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
713 write_protected = 1; 697 write_protected = 1;
714 } 698 }
@@ -732,7 +716,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
732 BUG_ON(!(*spte & PT_PRESENT_MASK)); 716 BUG_ON(!(*spte & PT_PRESENT_MASK));
733 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 717 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
734 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 718 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
735 if (is_writeble_pte(*spte)) { 719 if (is_writable_pte(*spte)) {
736 rmap_remove(kvm, spte); 720 rmap_remove(kvm, spte);
737 --kvm->stat.lpages; 721 --kvm->stat.lpages;
738 __set_spte(spte, shadow_trap_nonpresent_pte); 722 __set_spte(spte, shadow_trap_nonpresent_pte);
@@ -787,7 +771,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
787 771
788 new_spte &= ~PT_WRITABLE_MASK; 772 new_spte &= ~PT_WRITABLE_MASK;
789 new_spte &= ~SPTE_HOST_WRITEABLE; 773 new_spte &= ~SPTE_HOST_WRITEABLE;
790 if (is_writeble_pte(*spte)) 774 if (is_writable_pte(*spte))
791 kvm_set_pfn_dirty(spte_to_pfn(*spte)); 775 kvm_set_pfn_dirty(spte_to_pfn(*spte));
792 __set_spte(spte, new_spte); 776 __set_spte(spte, new_spte);
793 spte = rmap_next(kvm, rmapp, spte); 777 spte = rmap_next(kvm, rmapp, spte);
@@ -805,35 +789,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
805 unsigned long data)) 789 unsigned long data))
806{ 790{
807 int i, j; 791 int i, j;
792 int ret;
808 int retval = 0; 793 int retval = 0;
794 struct kvm_memslots *slots;
809 795
810 /* 796 slots = rcu_dereference(kvm->memslots);
811 * If mmap_sem isn't taken, we can look the memslots with only 797
812 * the mmu_lock by skipping over the slots with userspace_addr == 0. 798 for (i = 0; i < slots->nmemslots; i++) {
813 */ 799 struct kvm_memory_slot *memslot = &slots->memslots[i];
814 for (i = 0; i < kvm->nmemslots; i++) {
815 struct kvm_memory_slot *memslot = &kvm->memslots[i];
816 unsigned long start = memslot->userspace_addr; 800 unsigned long start = memslot->userspace_addr;
817 unsigned long end; 801 unsigned long end;
818 802
819 /* mmu_lock protects userspace_addr */
820 if (!start)
821 continue;
822
823 end = start + (memslot->npages << PAGE_SHIFT); 803 end = start + (memslot->npages << PAGE_SHIFT);
824 if (hva >= start && hva < end) { 804 if (hva >= start && hva < end) {
825 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 805 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
826 806
827 retval |= handler(kvm, &memslot->rmap[gfn_offset], 807 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
828 data);
829 808
830 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 809 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
831 int idx = gfn_offset; 810 int idx = gfn_offset;
832 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 811 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
833 retval |= handler(kvm, 812 ret |= handler(kvm,
834 &memslot->lpage_info[j][idx].rmap_pde, 813 &memslot->lpage_info[j][idx].rmap_pde,
835 data); 814 data);
836 } 815 }
816 trace_kvm_age_page(hva, memslot, ret);
817 retval |= ret;
837 } 818 }
838 } 819 }
839 820
@@ -856,9 +837,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
856 u64 *spte; 837 u64 *spte;
857 int young = 0; 838 int young = 0;
858 839
859 /* always return old for EPT */ 840 /*
841 * Emulate the accessed bit for EPT, by checking if this page has
842 * an EPT mapping, and clearing it if it does. On the next access,
843 * a new EPT mapping will be established.
844 * This has some overhead, but not as much as the cost of swapping
845 * out actively used pages or breaking up actively used hugepages.
846 */
860 if (!shadow_accessed_mask) 847 if (!shadow_accessed_mask)
861 return 0; 848 return kvm_unmap_rmapp(kvm, rmapp, data);
862 849
863 spte = rmap_next(kvm, rmapp, NULL); 850 spte = rmap_next(kvm, rmapp, NULL);
864 while (spte) { 851 while (spte) {
@@ -1615,7 +1602,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1615 1602
1616static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) 1603static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1617{ 1604{
1618 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); 1605 int slot = memslot_id(kvm, gfn);
1619 struct kvm_mmu_page *sp = page_header(__pa(pte)); 1606 struct kvm_mmu_page *sp = page_header(__pa(pte));
1620 1607
1621 __set_bit(slot, sp->slot_bitmap); 1608 __set_bit(slot, sp->slot_bitmap);
@@ -1639,7 +1626,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1639{ 1626{
1640 struct page *page; 1627 struct page *page;
1641 1628
1642 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 1629 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
1643 1630
1644 if (gpa == UNMAPPED_GVA) 1631 if (gpa == UNMAPPED_GVA)
1645 return NULL; 1632 return NULL;
@@ -1852,7 +1839,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1852 * is responsibility of mmu_get_page / kvm_sync_page. 1839 * is responsibility of mmu_get_page / kvm_sync_page.
1853 * Same reasoning can be applied to dirty page accounting. 1840 * Same reasoning can be applied to dirty page accounting.
1854 */ 1841 */
1855 if (!can_unsync && is_writeble_pte(*sptep)) 1842 if (!can_unsync && is_writable_pte(*sptep))
1856 goto set_pte; 1843 goto set_pte;
1857 1844
1858 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1845 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1860,7 +1847,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1860 __func__, gfn); 1847 __func__, gfn);
1861 ret = 1; 1848 ret = 1;
1862 pte_access &= ~ACC_WRITE_MASK; 1849 pte_access &= ~ACC_WRITE_MASK;
1863 if (is_writeble_pte(spte)) 1850 if (is_writable_pte(spte))
1864 spte &= ~PT_WRITABLE_MASK; 1851 spte &= ~PT_WRITABLE_MASK;
1865 } 1852 }
1866 } 1853 }
@@ -1881,7 +1868,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1881 bool reset_host_protection) 1868 bool reset_host_protection)
1882{ 1869{
1883 int was_rmapped = 0; 1870 int was_rmapped = 0;
1884 int was_writeble = is_writeble_pte(*sptep); 1871 int was_writable = is_writable_pte(*sptep);
1885 int rmap_count; 1872 int rmap_count;
1886 1873
1887 pgprintk("%s: spte %llx access %x write_fault %d" 1874 pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1932,7 +1919,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1932 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 1919 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1933 rmap_recycle(vcpu, sptep, gfn); 1920 rmap_recycle(vcpu, sptep, gfn);
1934 } else { 1921 } else {
1935 if (was_writeble) 1922 if (was_writable)
1936 kvm_release_pfn_dirty(pfn); 1923 kvm_release_pfn_dirty(pfn);
1937 else 1924 else
1938 kvm_release_pfn_clean(pfn); 1925 kvm_release_pfn_clean(pfn);
@@ -2162,8 +2149,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2162 spin_unlock(&vcpu->kvm->mmu_lock); 2149 spin_unlock(&vcpu->kvm->mmu_lock);
2163} 2150}
2164 2151
2165static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2152static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2153 u32 access, u32 *error)
2166{ 2154{
2155 if (error)
2156 *error = 0;
2167 return vaddr; 2157 return vaddr;
2168} 2158}
2169 2159
@@ -2747,7 +2737,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2747 if (tdp_enabled) 2737 if (tdp_enabled)
2748 return 0; 2738 return 0;
2749 2739
2750 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 2740 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2751 2741
2752 spin_lock(&vcpu->kvm->mmu_lock); 2742 spin_lock(&vcpu->kvm->mmu_lock);
2753 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2743 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -2847,16 +2837,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2847 */ 2837 */
2848 page = alloc_page(GFP_KERNEL | __GFP_DMA32); 2838 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
2849 if (!page) 2839 if (!page)
2850 goto error_1; 2840 return -ENOMEM;
2841
2851 vcpu->arch.mmu.pae_root = page_address(page); 2842 vcpu->arch.mmu.pae_root = page_address(page);
2852 for (i = 0; i < 4; ++i) 2843 for (i = 0; i < 4; ++i)
2853 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2844 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2854 2845
2855 return 0; 2846 return 0;
2856
2857error_1:
2858 free_mmu_pages(vcpu);
2859 return -ENOMEM;
2860} 2847}
2861 2848
2862int kvm_mmu_create(struct kvm_vcpu *vcpu) 2849int kvm_mmu_create(struct kvm_vcpu *vcpu)
@@ -2936,10 +2923,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2936 spin_lock(&kvm_lock); 2923 spin_lock(&kvm_lock);
2937 2924
2938 list_for_each_entry(kvm, &vm_list, vm_list) { 2925 list_for_each_entry(kvm, &vm_list, vm_list) {
2939 int npages; 2926 int npages, idx;
2940 2927
2941 if (!down_read_trylock(&kvm->slots_lock)) 2928 idx = srcu_read_lock(&kvm->srcu);
2942 continue;
2943 spin_lock(&kvm->mmu_lock); 2929 spin_lock(&kvm->mmu_lock);
2944 npages = kvm->arch.n_alloc_mmu_pages - 2930 npages = kvm->arch.n_alloc_mmu_pages -
2945 kvm->arch.n_free_mmu_pages; 2931 kvm->arch.n_free_mmu_pages;
@@ -2952,7 +2938,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2952 nr_to_scan--; 2938 nr_to_scan--;
2953 2939
2954 spin_unlock(&kvm->mmu_lock); 2940 spin_unlock(&kvm->mmu_lock);
2955 up_read(&kvm->slots_lock); 2941 srcu_read_unlock(&kvm->srcu, idx);
2956 } 2942 }
2957 if (kvm_freed) 2943 if (kvm_freed)
2958 list_move_tail(&kvm_freed->vm_list, &vm_list); 2944 list_move_tail(&kvm_freed->vm_list, &vm_list);
@@ -3019,9 +3005,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3019 int i; 3005 int i;
3020 unsigned int nr_mmu_pages; 3006 unsigned int nr_mmu_pages;
3021 unsigned int nr_pages = 0; 3007 unsigned int nr_pages = 0;
3008 struct kvm_memslots *slots;
3022 3009
3023 for (i = 0; i < kvm->nmemslots; i++) 3010 slots = rcu_dereference(kvm->memslots);
3024 nr_pages += kvm->memslots[i].npages; 3011 for (i = 0; i < slots->nmemslots; i++)
3012 nr_pages += slots->memslots[i].npages;
3025 3013
3026 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; 3014 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3027 nr_mmu_pages = max(nr_mmu_pages, 3015 nr_mmu_pages = max(nr_mmu_pages,
@@ -3246,7 +3234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3246 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) 3234 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3247 audit_mappings_page(vcpu, ent, va, level - 1); 3235 audit_mappings_page(vcpu, ent, va, level - 1);
3248 else { 3236 else {
3249 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 3237 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3250 gfn_t gfn = gpa >> PAGE_SHIFT; 3238 gfn_t gfn = gpa >> PAGE_SHIFT;
3251 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); 3239 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3252 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; 3240 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
@@ -3291,10 +3279,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
3291static int count_rmaps(struct kvm_vcpu *vcpu) 3279static int count_rmaps(struct kvm_vcpu *vcpu)
3292{ 3280{
3293 int nmaps = 0; 3281 int nmaps = 0;
3294 int i, j, k; 3282 int i, j, k, idx;
3295 3283
3284 idx = srcu_read_lock(&kvm->srcu);
3285 slots = rcu_dereference(kvm->memslots);
3296 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3286 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3297 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; 3287 struct kvm_memory_slot *m = &slots->memslots[i];
3298 struct kvm_rmap_desc *d; 3288 struct kvm_rmap_desc *d;
3299 3289
3300 for (j = 0; j < m->npages; ++j) { 3290 for (j = 0; j < m->npages; ++j) {
@@ -3317,6 +3307,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3317 } 3307 }
3318 } 3308 }
3319 } 3309 }
3310 srcu_read_unlock(&kvm->srcu, idx);
3320 return nmaps; 3311 return nmaps;
3321} 3312}
3322 3313
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 61a1b3884b49..be66759321a5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -2,6 +2,7 @@
2#define __KVM_X86_MMU_H 2#define __KVM_X86_MMU_H
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5#include "kvm_cache_regs.h"
5 6
6#define PT64_PT_BITS 9 7#define PT64_PT_BITS 9
7#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) 8#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -37,6 +38,16 @@
37#define PT32_ROOT_LEVEL 2 38#define PT32_ROOT_LEVEL 2
38#define PT32E_ROOT_LEVEL 3 39#define PT32E_ROOT_LEVEL 3
39 40
41#define PT_PDPE_LEVEL 3
42#define PT_DIRECTORY_LEVEL 2
43#define PT_PAGE_TABLE_LEVEL 1
44
45#define PFERR_PRESENT_MASK (1U << 0)
46#define PFERR_WRITE_MASK (1U << 1)
47#define PFERR_USER_MASK (1U << 2)
48#define PFERR_RSVD_MASK (1U << 3)
49#define PFERR_FETCH_MASK (1U << 4)
50
40int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
41 52
42static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 53static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
@@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
53 return kvm_mmu_load(vcpu); 64 return kvm_mmu_load(vcpu);
54} 65}
55 66
56static inline int is_long_mode(struct kvm_vcpu *vcpu)
57{
58#ifdef CONFIG_X86_64
59 return vcpu->arch.shadow_efer & EFER_LMA;
60#else
61 return 0;
62#endif
63}
64
65static inline int is_pae(struct kvm_vcpu *vcpu)
66{
67 return vcpu->arch.cr4 & X86_CR4_PAE;
68}
69
70static inline int is_pse(struct kvm_vcpu *vcpu)
71{
72 return vcpu->arch.cr4 & X86_CR4_PSE;
73}
74
75static inline int is_paging(struct kvm_vcpu *vcpu)
76{
77 return vcpu->arch.cr0 & X86_CR0_PG;
78}
79
80static inline int is_present_gpte(unsigned long pte) 67static inline int is_present_gpte(unsigned long pte)
81{ 68{
82 return pte & PT_PRESENT_MASK; 69 return pte & PT_PRESENT_MASK;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ede2131a9225..81eab9a50e6a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -162,7 +162,7 @@ walk:
162 if (rsvd_fault) 162 if (rsvd_fault)
163 goto access_error; 163 goto access_error;
164 164
165 if (write_fault && !is_writeble_pte(pte)) 165 if (write_fault && !is_writable_pte(pte))
166 if (user_fault || is_write_protection(vcpu)) 166 if (user_fault || is_write_protection(vcpu))
167 goto access_error; 167 goto access_error;
168 168
@@ -490,18 +490,23 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
490 spin_unlock(&vcpu->kvm->mmu_lock); 490 spin_unlock(&vcpu->kvm->mmu_lock);
491} 491}
492 492
493static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 493static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
494 u32 *error)
494{ 495{
495 struct guest_walker walker; 496 struct guest_walker walker;
496 gpa_t gpa = UNMAPPED_GVA; 497 gpa_t gpa = UNMAPPED_GVA;
497 int r; 498 int r;
498 499
499 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); 500 r = FNAME(walk_addr)(&walker, vcpu, vaddr,
501 !!(access & PFERR_WRITE_MASK),
502 !!(access & PFERR_USER_MASK),
503 !!(access & PFERR_FETCH_MASK));
500 504
501 if (r) { 505 if (r) {
502 gpa = gfn_to_gpa(walker.gfn); 506 gpa = gfn_to_gpa(walker.gfn);
503 gpa |= vaddr & ~PAGE_MASK; 507 gpa |= vaddr & ~PAGE_MASK;
504 } 508 } else if (error)
509 *error = walker.error_code;
505 510
506 return gpa; 511 return gpa;
507} 512}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1d9b33843c80..52f78dd03010 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -231,7 +231,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
231 efer &= ~EFER_LME; 231 efer &= ~EFER_LME;
232 232
233 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 233 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
234 vcpu->arch.shadow_efer = efer; 234 vcpu->arch.efer = efer;
235} 235}
236 236
237static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 237static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -540,6 +540,8 @@ static void init_vmcb(struct vcpu_svm *svm)
540 struct vmcb_control_area *control = &svm->vmcb->control; 540 struct vmcb_control_area *control = &svm->vmcb->control;
541 struct vmcb_save_area *save = &svm->vmcb->save; 541 struct vmcb_save_area *save = &svm->vmcb->save;
542 542
543 svm->vcpu.fpu_active = 1;
544
543 control->intercept_cr_read = INTERCEPT_CR0_MASK | 545 control->intercept_cr_read = INTERCEPT_CR0_MASK |
544 INTERCEPT_CR3_MASK | 546 INTERCEPT_CR3_MASK |
545 INTERCEPT_CR4_MASK; 547 INTERCEPT_CR4_MASK;
@@ -552,13 +554,19 @@ static void init_vmcb(struct vcpu_svm *svm)
552 control->intercept_dr_read = INTERCEPT_DR0_MASK | 554 control->intercept_dr_read = INTERCEPT_DR0_MASK |
553 INTERCEPT_DR1_MASK | 555 INTERCEPT_DR1_MASK |
554 INTERCEPT_DR2_MASK | 556 INTERCEPT_DR2_MASK |
555 INTERCEPT_DR3_MASK; 557 INTERCEPT_DR3_MASK |
558 INTERCEPT_DR4_MASK |
559 INTERCEPT_DR5_MASK |
560 INTERCEPT_DR6_MASK |
561 INTERCEPT_DR7_MASK;
556 562
557 control->intercept_dr_write = INTERCEPT_DR0_MASK | 563 control->intercept_dr_write = INTERCEPT_DR0_MASK |
558 INTERCEPT_DR1_MASK | 564 INTERCEPT_DR1_MASK |
559 INTERCEPT_DR2_MASK | 565 INTERCEPT_DR2_MASK |
560 INTERCEPT_DR3_MASK | 566 INTERCEPT_DR3_MASK |
567 INTERCEPT_DR4_MASK |
561 INTERCEPT_DR5_MASK | 568 INTERCEPT_DR5_MASK |
569 INTERCEPT_DR6_MASK |
562 INTERCEPT_DR7_MASK; 570 INTERCEPT_DR7_MASK;
563 571
564 control->intercept_exceptions = (1 << PF_VECTOR) | 572 control->intercept_exceptions = (1 << PF_VECTOR) |
@@ -569,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
569 control->intercept = (1ULL << INTERCEPT_INTR) | 577 control->intercept = (1ULL << INTERCEPT_INTR) |
570 (1ULL << INTERCEPT_NMI) | 578 (1ULL << INTERCEPT_NMI) |
571 (1ULL << INTERCEPT_SMI) | 579 (1ULL << INTERCEPT_SMI) |
580 (1ULL << INTERCEPT_SELECTIVE_CR0) |
572 (1ULL << INTERCEPT_CPUID) | 581 (1ULL << INTERCEPT_CPUID) |
573 (1ULL << INTERCEPT_INVD) | 582 (1ULL << INTERCEPT_INVD) |
574 (1ULL << INTERCEPT_HLT) | 583 (1ULL << INTERCEPT_HLT) |
@@ -641,10 +650,8 @@ static void init_vmcb(struct vcpu_svm *svm)
641 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 650 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
642 (1ULL << INTERCEPT_INVLPG)); 651 (1ULL << INTERCEPT_INVLPG));
643 control->intercept_exceptions &= ~(1 << PF_VECTOR); 652 control->intercept_exceptions &= ~(1 << PF_VECTOR);
644 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| 653 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
645 INTERCEPT_CR3_MASK); 654 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
646 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
647 INTERCEPT_CR3_MASK);
648 save->g_pat = 0x0007040600070406ULL; 655 save->g_pat = 0x0007040600070406ULL;
649 save->cr3 = 0; 656 save->cr3 = 0;
650 save->cr4 = 0; 657 save->cr4 = 0;
@@ -730,7 +737,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
730 init_vmcb(svm); 737 init_vmcb(svm);
731 738
732 fx_init(&svm->vcpu); 739 fx_init(&svm->vcpu);
733 svm->vcpu.fpu_active = 1;
734 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 740 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
735 if (kvm_vcpu_is_bsp(&svm->vcpu)) 741 if (kvm_vcpu_is_bsp(&svm->vcpu))
736 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 742 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
@@ -765,14 +771,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
765 if (unlikely(cpu != vcpu->cpu)) { 771 if (unlikely(cpu != vcpu->cpu)) {
766 u64 delta; 772 u64 delta;
767 773
768 /* 774 if (check_tsc_unstable()) {
769 * Make sure that the guest sees a monotonically 775 /*
770 * increasing TSC. 776 * Make sure that the guest sees a monotonically
771 */ 777 * increasing TSC.
772 delta = vcpu->arch.host_tsc - native_read_tsc(); 778 */
773 svm->vmcb->control.tsc_offset += delta; 779 delta = vcpu->arch.host_tsc - native_read_tsc();
774 if (is_nested(svm)) 780 svm->vmcb->control.tsc_offset += delta;
775 svm->nested.hsave->control.tsc_offset += delta; 781 if (is_nested(svm))
782 svm->nested.hsave->control.tsc_offset += delta;
783 }
776 vcpu->cpu = cpu; 784 vcpu->cpu = cpu;
777 kvm_migrate_timers(vcpu); 785 kvm_migrate_timers(vcpu);
778 svm->asid_generation = 0; 786 svm->asid_generation = 0;
@@ -954,42 +962,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
954 svm->vmcb->save.gdtr.base = dt->base ; 962 svm->vmcb->save.gdtr.base = dt->base ;
955} 963}
956 964
965static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
966{
967}
968
957static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 969static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
958{ 970{
959} 971}
960 972
973static void update_cr0_intercept(struct vcpu_svm *svm)
974{
975 ulong gcr0 = svm->vcpu.arch.cr0;
976 u64 *hcr0 = &svm->vmcb->save.cr0;
977
978 if (!svm->vcpu.fpu_active)
979 *hcr0 |= SVM_CR0_SELECTIVE_MASK;
980 else
981 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
982 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
983
984
985 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
986 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
987 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
988 } else {
989 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
990 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
991 }
992}
993
961static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 994static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
962{ 995{
963 struct vcpu_svm *svm = to_svm(vcpu); 996 struct vcpu_svm *svm = to_svm(vcpu);
964 997
965#ifdef CONFIG_X86_64 998#ifdef CONFIG_X86_64
966 if (vcpu->arch.shadow_efer & EFER_LME) { 999 if (vcpu->arch.efer & EFER_LME) {
967 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1000 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
968 vcpu->arch.shadow_efer |= EFER_LMA; 1001 vcpu->arch.efer |= EFER_LMA;
969 svm->vmcb->save.efer |= EFER_LMA | EFER_LME; 1002 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
970 } 1003 }
971 1004
972 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { 1005 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
973 vcpu->arch.shadow_efer &= ~EFER_LMA; 1006 vcpu->arch.efer &= ~EFER_LMA;
974 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); 1007 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
975 } 1008 }
976 } 1009 }
977#endif 1010#endif
978 if (npt_enabled) 1011 vcpu->arch.cr0 = cr0;
979 goto set;
980 1012
981 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { 1013 if (!npt_enabled)
982 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1014 cr0 |= X86_CR0_PG | X86_CR0_WP;
983 vcpu->fpu_active = 1;
984 }
985 1015
986 vcpu->arch.cr0 = cr0; 1016 if (!vcpu->fpu_active)
987 cr0 |= X86_CR0_PG | X86_CR0_WP;
988 if (!vcpu->fpu_active) {
989 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
990 cr0 |= X86_CR0_TS; 1017 cr0 |= X86_CR0_TS;
991 }
992set:
993 /* 1018 /*
994 * re-enable caching here because the QEMU bios 1019 * re-enable caching here because the QEMU bios
995 * does not do it - this results in some delay at 1020 * does not do it - this results in some delay at
@@ -997,6 +1022,7 @@ set:
997 */ 1022 */
998 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1023 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
999 svm->vmcb->save.cr0 = cr0; 1024 svm->vmcb->save.cr0 = cr0;
1025 update_cr0_intercept(svm);
1000} 1026}
1001 1027
1002static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1028static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1102,76 +1128,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1102 svm->vmcb->control.asid = sd->next_asid++; 1128 svm->vmcb->control.asid = sd->next_asid++;
1103} 1129}
1104 1130
1105static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 1131static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
1106{ 1132{
1107 struct vcpu_svm *svm = to_svm(vcpu); 1133 struct vcpu_svm *svm = to_svm(vcpu);
1108 unsigned long val;
1109 1134
1110 switch (dr) { 1135 switch (dr) {
1111 case 0 ... 3: 1136 case 0 ... 3:
1112 val = vcpu->arch.db[dr]; 1137 *dest = vcpu->arch.db[dr];
1113 break; 1138 break;
1139 case 4:
1140 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1141 return EMULATE_FAIL; /* will re-inject UD */
1142 /* fall through */
1114 case 6: 1143 case 6:
1115 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1144 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1116 val = vcpu->arch.dr6; 1145 *dest = vcpu->arch.dr6;
1117 else 1146 else
1118 val = svm->vmcb->save.dr6; 1147 *dest = svm->vmcb->save.dr6;
1119 break; 1148 break;
1149 case 5:
1150 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1151 return EMULATE_FAIL; /* will re-inject UD */
1152 /* fall through */
1120 case 7: 1153 case 7:
1121 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1154 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1122 val = vcpu->arch.dr7; 1155 *dest = vcpu->arch.dr7;
1123 else 1156 else
1124 val = svm->vmcb->save.dr7; 1157 *dest = svm->vmcb->save.dr7;
1125 break; 1158 break;
1126 default:
1127 val = 0;
1128 } 1159 }
1129 1160
1130 return val; 1161 return EMULATE_DONE;
1131} 1162}
1132 1163
1133static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 1164static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
1134 int *exception)
1135{ 1165{
1136 struct vcpu_svm *svm = to_svm(vcpu); 1166 struct vcpu_svm *svm = to_svm(vcpu);
1137 1167
1138 *exception = 0;
1139
1140 switch (dr) { 1168 switch (dr) {
1141 case 0 ... 3: 1169 case 0 ... 3:
1142 vcpu->arch.db[dr] = value; 1170 vcpu->arch.db[dr] = value;
1143 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 1171 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1144 vcpu->arch.eff_db[dr] = value; 1172 vcpu->arch.eff_db[dr] = value;
1145 return; 1173 break;
1146 case 4 ... 5: 1174 case 4:
1147 if (vcpu->arch.cr4 & X86_CR4_DE) 1175 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1148 *exception = UD_VECTOR; 1176 return EMULATE_FAIL; /* will re-inject UD */
1149 return; 1177 /* fall through */
1150 case 6: 1178 case 6:
1151 if (value & 0xffffffff00000000ULL) {
1152 *exception = GP_VECTOR;
1153 return;
1154 }
1155 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; 1179 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1156 return; 1180 break;
1181 case 5:
1182 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1183 return EMULATE_FAIL; /* will re-inject UD */
1184 /* fall through */
1157 case 7: 1185 case 7:
1158 if (value & 0xffffffff00000000ULL) {
1159 *exception = GP_VECTOR;
1160 return;
1161 }
1162 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; 1186 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1163 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 1187 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1164 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1188 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1165 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); 1189 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1166 } 1190 }
1167 return; 1191 break;
1168 default:
1169 /* FIXME: Possible case? */
1170 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1171 __func__, dr);
1172 *exception = UD_VECTOR;
1173 return;
1174 } 1192 }
1193
1194 return EMULATE_DONE;
1175} 1195}
1176 1196
1177static int pf_interception(struct vcpu_svm *svm) 1197static int pf_interception(struct vcpu_svm *svm)
@@ -1239,13 +1259,17 @@ static int ud_interception(struct vcpu_svm *svm)
1239 return 1; 1259 return 1;
1240} 1260}
1241 1261
1242static int nm_interception(struct vcpu_svm *svm) 1262static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1243{ 1263{
1264 struct vcpu_svm *svm = to_svm(vcpu);
1244 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1265 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1245 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
1246 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
1247 svm->vcpu.fpu_active = 1; 1266 svm->vcpu.fpu_active = 1;
1267 update_cr0_intercept(svm);
1268}
1248 1269
1270static int nm_interception(struct vcpu_svm *svm)
1271{
1272 svm_fpu_activate(&svm->vcpu);
1249 return 1; 1273 return 1;
1250} 1274}
1251 1275
@@ -1337,7 +1361,7 @@ static int vmmcall_interception(struct vcpu_svm *svm)
1337 1361
1338static int nested_svm_check_permissions(struct vcpu_svm *svm) 1362static int nested_svm_check_permissions(struct vcpu_svm *svm)
1339{ 1363{
1340 if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) 1364 if (!(svm->vcpu.arch.efer & EFER_SVME)
1341 || !is_paging(&svm->vcpu)) { 1365 || !is_paging(&svm->vcpu)) {
1342 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1366 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1343 return 1; 1367 return 1;
@@ -1740,8 +1764,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1740 hsave->save.ds = vmcb->save.ds; 1764 hsave->save.ds = vmcb->save.ds;
1741 hsave->save.gdtr = vmcb->save.gdtr; 1765 hsave->save.gdtr = vmcb->save.gdtr;
1742 hsave->save.idtr = vmcb->save.idtr; 1766 hsave->save.idtr = vmcb->save.idtr;
1743 hsave->save.efer = svm->vcpu.arch.shadow_efer; 1767 hsave->save.efer = svm->vcpu.arch.efer;
1744 hsave->save.cr0 = svm->vcpu.arch.cr0; 1768 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
1745 hsave->save.cr4 = svm->vcpu.arch.cr4; 1769 hsave->save.cr4 = svm->vcpu.arch.cr4;
1746 hsave->save.rflags = vmcb->save.rflags; 1770 hsave->save.rflags = vmcb->save.rflags;
1747 hsave->save.rip = svm->next_rip; 1771 hsave->save.rip = svm->next_rip;
@@ -2153,9 +2177,10 @@ static int rdmsr_interception(struct vcpu_svm *svm)
2153 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2177 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2154 u64 data; 2178 u64 data;
2155 2179
2156 if (svm_get_msr(&svm->vcpu, ecx, &data)) 2180 if (svm_get_msr(&svm->vcpu, ecx, &data)) {
2181 trace_kvm_msr_read_ex(ecx);
2157 kvm_inject_gp(&svm->vcpu, 0); 2182 kvm_inject_gp(&svm->vcpu, 0);
2158 else { 2183 } else {
2159 trace_kvm_msr_read(ecx, data); 2184 trace_kvm_msr_read(ecx, data);
2160 2185
2161 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; 2186 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
@@ -2247,13 +2272,15 @@ static int wrmsr_interception(struct vcpu_svm *svm)
2247 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2272 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
2248 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 2273 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2249 2274
2250 trace_kvm_msr_write(ecx, data);
2251 2275
2252 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2276 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2253 if (svm_set_msr(&svm->vcpu, ecx, data)) 2277 if (svm_set_msr(&svm->vcpu, ecx, data)) {
2278 trace_kvm_msr_write_ex(ecx, data);
2254 kvm_inject_gp(&svm->vcpu, 0); 2279 kvm_inject_gp(&svm->vcpu, 0);
2255 else 2280 } else {
2281 trace_kvm_msr_write(ecx, data);
2256 skip_emulated_instruction(&svm->vcpu); 2282 skip_emulated_instruction(&svm->vcpu);
2283 }
2257 return 1; 2284 return 1;
2258} 2285}
2259 2286
@@ -2297,7 +2324,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2297 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2324 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2298 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2325 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2299 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2326 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2300 /* for now: */ 2327 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2301 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2328 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
2302 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2329 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2303 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2330 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
@@ -2306,11 +2333,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2306 [SVM_EXIT_READ_DR1] = emulate_on_interception, 2333 [SVM_EXIT_READ_DR1] = emulate_on_interception,
2307 [SVM_EXIT_READ_DR2] = emulate_on_interception, 2334 [SVM_EXIT_READ_DR2] = emulate_on_interception,
2308 [SVM_EXIT_READ_DR3] = emulate_on_interception, 2335 [SVM_EXIT_READ_DR3] = emulate_on_interception,
2336 [SVM_EXIT_READ_DR4] = emulate_on_interception,
2337 [SVM_EXIT_READ_DR5] = emulate_on_interception,
2338 [SVM_EXIT_READ_DR6] = emulate_on_interception,
2339 [SVM_EXIT_READ_DR7] = emulate_on_interception,
2309 [SVM_EXIT_WRITE_DR0] = emulate_on_interception, 2340 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
2310 [SVM_EXIT_WRITE_DR1] = emulate_on_interception, 2341 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
2311 [SVM_EXIT_WRITE_DR2] = emulate_on_interception, 2342 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
2312 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 2343 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
2344 [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
2313 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 2345 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
2346 [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
2314 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 2347 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
2315 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 2348 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2316 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 2349 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
@@ -2383,20 +2416,10 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2383 2416
2384 svm_complete_interrupts(svm); 2417 svm_complete_interrupts(svm);
2385 2418
2386 if (npt_enabled) { 2419 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
2387 int mmu_reload = 0;
2388 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
2389 svm_set_cr0(vcpu, svm->vmcb->save.cr0);
2390 mmu_reload = 1;
2391 }
2392 vcpu->arch.cr0 = svm->vmcb->save.cr0; 2420 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2421 if (npt_enabled)
2393 vcpu->arch.cr3 = svm->vmcb->save.cr3; 2422 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2394 if (mmu_reload) {
2395 kvm_mmu_reset_context(vcpu);
2396 kvm_mmu_load(vcpu);
2397 }
2398 }
2399
2400 2423
2401 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2424 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2402 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2425 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2798,12 +2821,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
2798 2821
2799 svm->vmcb->save.cr3 = root; 2822 svm->vmcb->save.cr3 = root;
2800 force_new_asid(vcpu); 2823 force_new_asid(vcpu);
2801
2802 if (vcpu->fpu_active) {
2803 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
2804 svm->vmcb->save.cr0 |= X86_CR0_TS;
2805 vcpu->fpu_active = 0;
2806 }
2807} 2824}
2808 2825
2809static int is_disabled(void) 2826static int is_disabled(void)
@@ -2852,6 +2869,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2852 return 0; 2869 return 0;
2853} 2870}
2854 2871
2872static void svm_cpuid_update(struct kvm_vcpu *vcpu)
2873{
2874}
2875
2855static const struct trace_print_flags svm_exit_reasons_str[] = { 2876static const struct trace_print_flags svm_exit_reasons_str[] = {
2856 { SVM_EXIT_READ_CR0, "read_cr0" }, 2877 { SVM_EXIT_READ_CR0, "read_cr0" },
2857 { SVM_EXIT_READ_CR3, "read_cr3" }, 2878 { SVM_EXIT_READ_CR3, "read_cr3" },
@@ -2905,9 +2926,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
2905 { -1, NULL } 2926 { -1, NULL }
2906}; 2927};
2907 2928
2908static bool svm_gb_page_enable(void) 2929static int svm_get_lpage_level(void)
2909{ 2930{
2910 return true; 2931 return PT_PDPE_LEVEL;
2932}
2933
2934static bool svm_rdtscp_supported(void)
2935{
2936 return false;
2937}
2938
2939static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
2940{
2941 struct vcpu_svm *svm = to_svm(vcpu);
2942
2943 update_cr0_intercept(svm);
2944 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
2911} 2945}
2912 2946
2913static struct kvm_x86_ops svm_x86_ops = { 2947static struct kvm_x86_ops svm_x86_ops = {
@@ -2936,6 +2970,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2936 .set_segment = svm_set_segment, 2970 .set_segment = svm_set_segment,
2937 .get_cpl = svm_get_cpl, 2971 .get_cpl = svm_get_cpl,
2938 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 2972 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
2973 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
2939 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 2974 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
2940 .set_cr0 = svm_set_cr0, 2975 .set_cr0 = svm_set_cr0,
2941 .set_cr3 = svm_set_cr3, 2976 .set_cr3 = svm_set_cr3,
@@ -2950,6 +2985,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2950 .cache_reg = svm_cache_reg, 2985 .cache_reg = svm_cache_reg,
2951 .get_rflags = svm_get_rflags, 2986 .get_rflags = svm_get_rflags,
2952 .set_rflags = svm_set_rflags, 2987 .set_rflags = svm_set_rflags,
2988 .fpu_activate = svm_fpu_activate,
2989 .fpu_deactivate = svm_fpu_deactivate,
2953 2990
2954 .tlb_flush = svm_flush_tlb, 2991 .tlb_flush = svm_flush_tlb,
2955 2992
@@ -2975,7 +3012,11 @@ static struct kvm_x86_ops svm_x86_ops = {
2975 .get_mt_mask = svm_get_mt_mask, 3012 .get_mt_mask = svm_get_mt_mask,
2976 3013
2977 .exit_reasons_str = svm_exit_reasons_str, 3014 .exit_reasons_str = svm_exit_reasons_str,
2978 .gb_page_enable = svm_gb_page_enable, 3015 .get_lpage_level = svm_get_lpage_level,
3016
3017 .cpuid_update = svm_cpuid_update,
3018
3019 .rdtscp_supported = svm_rdtscp_supported,
2979}; 3020};
2980 3021
2981static int __init svm_init(void) 3022static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 816e0449db0b..6ad30a29f044 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall,
56); 56);
57 57
58/* 58/*
59 * Tracepoint for hypercall.
60 */
61TRACE_EVENT(kvm_hv_hypercall,
62 TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
63 __u64 ingpa, __u64 outgpa),
64 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
65
66 TP_STRUCT__entry(
67 __field( __u16, code )
68 __field( bool, fast )
69 __field( __u16, rep_cnt )
70 __field( __u16, rep_idx )
71 __field( __u64, ingpa )
72 __field( __u64, outgpa )
73 ),
74
75 TP_fast_assign(
76 __entry->code = code;
77 __entry->fast = fast;
78 __entry->rep_cnt = rep_cnt;
79 __entry->rep_idx = rep_idx;
80 __entry->ingpa = ingpa;
81 __entry->outgpa = outgpa;
82 ),
83
84 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
85 __entry->code, __entry->fast ? "fast" : "slow",
86 __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
87 __entry->outgpa)
88);
89
90/*
59 * Tracepoint for PIO. 91 * Tracepoint for PIO.
60 */ 92 */
61TRACE_EVENT(kvm_pio, 93TRACE_EVENT(kvm_pio,
@@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault,
214 * Tracepoint for guest MSR access. 246 * Tracepoint for guest MSR access.
215 */ 247 */
216TRACE_EVENT(kvm_msr, 248TRACE_EVENT(kvm_msr,
217 TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), 249 TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
218 TP_ARGS(rw, ecx, data), 250 TP_ARGS(write, ecx, data, exception),
219 251
220 TP_STRUCT__entry( 252 TP_STRUCT__entry(
221 __field( unsigned int, rw ) 253 __field( unsigned, write )
222 __field( unsigned int, ecx ) 254 __field( u32, ecx )
223 __field( unsigned long, data ) 255 __field( u64, data )
256 __field( u8, exception )
224 ), 257 ),
225 258
226 TP_fast_assign( 259 TP_fast_assign(
227 __entry->rw = rw; 260 __entry->write = write;
228 __entry->ecx = ecx; 261 __entry->ecx = ecx;
229 __entry->data = data; 262 __entry->data = data;
263 __entry->exception = exception;
230 ), 264 ),
231 265
232 TP_printk("msr_%s %x = 0x%lx", 266 TP_printk("msr_%s %x = 0x%llx%s",
233 __entry->rw ? "write" : "read", 267 __entry->write ? "write" : "read",
234 __entry->ecx, __entry->data) 268 __entry->ecx, __entry->data,
269 __entry->exception ? " (#GP)" : "")
235); 270);
236 271
237#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) 272#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
238#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) 273#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
274#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
275#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
239 276
240/* 277/*
241 * Tracepoint for guest CR access. 278 * Tracepoint for guest CR access.
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4918d6fc924..14873b9f8430 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,6 +61,21 @@ module_param_named(unrestricted_guest,
61static int __read_mostly emulate_invalid_guest_state = 0; 61static int __read_mostly emulate_invalid_guest_state = 0;
62module_param(emulate_invalid_guest_state, bool, S_IRUGO); 62module_param(emulate_invalid_guest_state, bool, S_IRUGO);
63 63
64#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
65 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
66#define KVM_GUEST_CR0_MASK \
67 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
68#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
69 (X86_CR0_WP | X86_CR0_NE)
70#define KVM_VM_CR0_ALWAYS_ON \
71 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
72#define KVM_CR4_GUEST_OWNED_BITS \
73 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
74 | X86_CR4_OSXMMEXCPT)
75
76#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
77#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
78
64/* 79/*
65 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 80 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
66 * ple_gap: upper bound on the amount of time between two successive 81 * ple_gap: upper bound on the amount of time between two successive
@@ -136,6 +151,8 @@ struct vcpu_vmx {
136 ktime_t entry_time; 151 ktime_t entry_time;
137 s64 vnmi_blocked_time; 152 s64 vnmi_blocked_time;
138 u32 exit_reason; 153 u32 exit_reason;
154
155 bool rdtscp_enabled;
139}; 156};
140 157
141static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 158static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -210,7 +227,7 @@ static const u32 vmx_msr_index[] = {
210#ifdef CONFIG_X86_64 227#ifdef CONFIG_X86_64
211 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 228 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
212#endif 229#endif
213 MSR_EFER, MSR_K6_STAR, 230 MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
214}; 231};
215#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 232#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
216 233
@@ -301,6 +318,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
301 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); 318 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
302} 319}
303 320
321static inline bool cpu_has_vmx_ept_1g_page(void)
322{
323 return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
324}
325
304static inline int cpu_has_vmx_invept_individual_addr(void) 326static inline int cpu_has_vmx_invept_individual_addr(void)
305{ 327{
306 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 328 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -336,9 +358,7 @@ static inline int cpu_has_vmx_ple(void)
336 358
337static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 359static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
338{ 360{
339 return flexpriority_enabled && 361 return flexpriority_enabled && irqchip_in_kernel(kvm);
340 (cpu_has_vmx_virtualize_apic_accesses()) &&
341 (irqchip_in_kernel(kvm));
342} 362}
343 363
344static inline int cpu_has_vmx_vpid(void) 364static inline int cpu_has_vmx_vpid(void)
@@ -347,6 +367,12 @@ static inline int cpu_has_vmx_vpid(void)
347 SECONDARY_EXEC_ENABLE_VPID; 367 SECONDARY_EXEC_ENABLE_VPID;
348} 368}
349 369
370static inline int cpu_has_vmx_rdtscp(void)
371{
372 return vmcs_config.cpu_based_2nd_exec_ctrl &
373 SECONDARY_EXEC_RDTSCP;
374}
375
350static inline int cpu_has_virtual_nmis(void) 376static inline int cpu_has_virtual_nmis(void)
351{ 377{
352 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 378 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -551,22 +577,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
551{ 577{
552 u32 eb; 578 u32 eb;
553 579
554 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); 580 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
555 if (!vcpu->fpu_active) 581 (1u << NM_VECTOR) | (1u << DB_VECTOR);
556 eb |= 1u << NM_VECTOR; 582 if ((vcpu->guest_debug &
557 /* 583 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
558 * Unconditionally intercept #DB so we can maintain dr6 without 584 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
559 * reading it every exit. 585 eb |= 1u << BP_VECTOR;
560 */
561 eb |= 1u << DB_VECTOR;
562 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
563 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
564 eb |= 1u << BP_VECTOR;
565 }
566 if (to_vmx(vcpu)->rmode.vm86_active) 586 if (to_vmx(vcpu)->rmode.vm86_active)
567 eb = ~0; 587 eb = ~0;
568 if (enable_ept) 588 if (enable_ept)
569 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 589 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
590 if (vcpu->fpu_active)
591 eb &= ~(1u << NM_VECTOR);
570 vmcs_write32(EXCEPTION_BITMAP, eb); 592 vmcs_write32(EXCEPTION_BITMAP, eb);
571} 593}
572 594
@@ -589,7 +611,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
589 u64 guest_efer; 611 u64 guest_efer;
590 u64 ignore_bits; 612 u64 ignore_bits;
591 613
592 guest_efer = vmx->vcpu.arch.shadow_efer; 614 guest_efer = vmx->vcpu.arch.efer;
593 615
594 /* 616 /*
595 * NX is emulated; LMA and LME handled by hardware; SCE meaninless 617 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
@@ -767,22 +789,30 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
767 789
768static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 790static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
769{ 791{
792 ulong cr0;
793
770 if (vcpu->fpu_active) 794 if (vcpu->fpu_active)
771 return; 795 return;
772 vcpu->fpu_active = 1; 796 vcpu->fpu_active = 1;
773 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); 797 cr0 = vmcs_readl(GUEST_CR0);
774 if (vcpu->arch.cr0 & X86_CR0_TS) 798 cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
775 vmcs_set_bits(GUEST_CR0, X86_CR0_TS); 799 cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
800 vmcs_writel(GUEST_CR0, cr0);
776 update_exception_bitmap(vcpu); 801 update_exception_bitmap(vcpu);
802 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
803 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
777} 804}
778 805
806static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
807
779static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 808static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
780{ 809{
781 if (!vcpu->fpu_active) 810 vmx_decache_cr0_guest_bits(vcpu);
782 return; 811 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
783 vcpu->fpu_active = 0;
784 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
785 update_exception_bitmap(vcpu); 812 update_exception_bitmap(vcpu);
813 vcpu->arch.cr0_guest_owned_bits = 0;
814 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
815 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
786} 816}
787 817
788static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 818static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -878,6 +908,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
878 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 908 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
879} 909}
880 910
911static bool vmx_rdtscp_supported(void)
912{
913 return cpu_has_vmx_rdtscp();
914}
915
881/* 916/*
882 * Swap MSR entry in host/guest MSR entry array. 917 * Swap MSR entry in host/guest MSR entry array.
883 */ 918 */
@@ -913,12 +948,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
913 index = __find_msr_index(vmx, MSR_CSTAR); 948 index = __find_msr_index(vmx, MSR_CSTAR);
914 if (index >= 0) 949 if (index >= 0)
915 move_msr_up(vmx, index, save_nmsrs++); 950 move_msr_up(vmx, index, save_nmsrs++);
951 index = __find_msr_index(vmx, MSR_TSC_AUX);
952 if (index >= 0 && vmx->rdtscp_enabled)
953 move_msr_up(vmx, index, save_nmsrs++);
916 /* 954 /*
917 * MSR_K6_STAR is only needed on long mode guests, and only 955 * MSR_K6_STAR is only needed on long mode guests, and only
918 * if efer.sce is enabled. 956 * if efer.sce is enabled.
919 */ 957 */
920 index = __find_msr_index(vmx, MSR_K6_STAR); 958 index = __find_msr_index(vmx, MSR_K6_STAR);
921 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) 959 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
922 move_msr_up(vmx, index, save_nmsrs++); 960 move_msr_up(vmx, index, save_nmsrs++);
923 } 961 }
924#endif 962#endif
@@ -1002,6 +1040,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1002 case MSR_IA32_SYSENTER_ESP: 1040 case MSR_IA32_SYSENTER_ESP:
1003 data = vmcs_readl(GUEST_SYSENTER_ESP); 1041 data = vmcs_readl(GUEST_SYSENTER_ESP);
1004 break; 1042 break;
1043 case MSR_TSC_AUX:
1044 if (!to_vmx(vcpu)->rdtscp_enabled)
1045 return 1;
1046 /* Otherwise falls through */
1005 default: 1047 default:
1006 vmx_load_host_state(to_vmx(vcpu)); 1048 vmx_load_host_state(to_vmx(vcpu));
1007 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1049 msr = find_msr_entry(to_vmx(vcpu), msr_index);
@@ -1065,7 +1107,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1065 vcpu->arch.pat = data; 1107 vcpu->arch.pat = data;
1066 break; 1108 break;
1067 } 1109 }
1068 /* Otherwise falls through to kvm_set_msr_common */ 1110 ret = kvm_set_msr_common(vcpu, msr_index, data);
1111 break;
1112 case MSR_TSC_AUX:
1113 if (!vmx->rdtscp_enabled)
1114 return 1;
1115 /* Check reserved bit, higher 32 bits should be zero */
1116 if ((data >> 32) != 0)
1117 return 1;
1118 /* Otherwise falls through */
1069 default: 1119 default:
1070 msr = find_msr_entry(vmx, msr_index); 1120 msr = find_msr_entry(vmx, msr_index);
1071 if (msr) { 1121 if (msr) {
@@ -1224,6 +1274,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1224 CPU_BASED_USE_IO_BITMAPS | 1274 CPU_BASED_USE_IO_BITMAPS |
1225 CPU_BASED_MOV_DR_EXITING | 1275 CPU_BASED_MOV_DR_EXITING |
1226 CPU_BASED_USE_TSC_OFFSETING | 1276 CPU_BASED_USE_TSC_OFFSETING |
1277 CPU_BASED_MWAIT_EXITING |
1278 CPU_BASED_MONITOR_EXITING |
1227 CPU_BASED_INVLPG_EXITING; 1279 CPU_BASED_INVLPG_EXITING;
1228 opt = CPU_BASED_TPR_SHADOW | 1280 opt = CPU_BASED_TPR_SHADOW |
1229 CPU_BASED_USE_MSR_BITMAPS | 1281 CPU_BASED_USE_MSR_BITMAPS |
@@ -1243,7 +1295,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1243 SECONDARY_EXEC_ENABLE_VPID | 1295 SECONDARY_EXEC_ENABLE_VPID |
1244 SECONDARY_EXEC_ENABLE_EPT | 1296 SECONDARY_EXEC_ENABLE_EPT |
1245 SECONDARY_EXEC_UNRESTRICTED_GUEST | 1297 SECONDARY_EXEC_UNRESTRICTED_GUEST |
1246 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 1298 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
1299 SECONDARY_EXEC_RDTSCP;
1247 if (adjust_vmx_controls(min2, opt2, 1300 if (adjust_vmx_controls(min2, opt2,
1248 MSR_IA32_VMX_PROCBASED_CTLS2, 1301 MSR_IA32_VMX_PROCBASED_CTLS2,
1249 &_cpu_based_2nd_exec_control) < 0) 1302 &_cpu_based_2nd_exec_control) < 0)
@@ -1457,8 +1510,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1457static gva_t rmode_tss_base(struct kvm *kvm) 1510static gva_t rmode_tss_base(struct kvm *kvm)
1458{ 1511{
1459 if (!kvm->arch.tss_addr) { 1512 if (!kvm->arch.tss_addr) {
1460 gfn_t base_gfn = kvm->memslots[0].base_gfn + 1513 struct kvm_memslots *slots;
1461 kvm->memslots[0].npages - 3; 1514 gfn_t base_gfn;
1515
1516 slots = rcu_dereference(kvm->memslots);
1517 base_gfn = kvm->memslots->memslots[0].base_gfn +
1518 kvm->memslots->memslots[0].npages - 3;
1462 return base_gfn << PAGE_SHIFT; 1519 return base_gfn << PAGE_SHIFT;
1463 } 1520 }
1464 return kvm->arch.tss_addr; 1521 return kvm->arch.tss_addr;
@@ -1544,9 +1601,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1544 * of this msr depends on is_long_mode(). 1601 * of this msr depends on is_long_mode().
1545 */ 1602 */
1546 vmx_load_host_state(to_vmx(vcpu)); 1603 vmx_load_host_state(to_vmx(vcpu));
1547 vcpu->arch.shadow_efer = efer; 1604 vcpu->arch.efer = efer;
1548 if (!msr)
1549 return;
1550 if (efer & EFER_LMA) { 1605 if (efer & EFER_LMA) {
1551 vmcs_write32(VM_ENTRY_CONTROLS, 1606 vmcs_write32(VM_ENTRY_CONTROLS,
1552 vmcs_read32(VM_ENTRY_CONTROLS) | 1607 vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1576,13 +1631,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1576 (guest_tr_ar & ~AR_TYPE_MASK) 1631 (guest_tr_ar & ~AR_TYPE_MASK)
1577 | AR_TYPE_BUSY_64_TSS); 1632 | AR_TYPE_BUSY_64_TSS);
1578 } 1633 }
1579 vcpu->arch.shadow_efer |= EFER_LMA; 1634 vcpu->arch.efer |= EFER_LMA;
1580 vmx_set_efer(vcpu, vcpu->arch.shadow_efer); 1635 vmx_set_efer(vcpu, vcpu->arch.efer);
1581} 1636}
1582 1637
1583static void exit_lmode(struct kvm_vcpu *vcpu) 1638static void exit_lmode(struct kvm_vcpu *vcpu)
1584{ 1639{
1585 vcpu->arch.shadow_efer &= ~EFER_LMA; 1640 vcpu->arch.efer &= ~EFER_LMA;
1586 1641
1587 vmcs_write32(VM_ENTRY_CONTROLS, 1642 vmcs_write32(VM_ENTRY_CONTROLS,
1588 vmcs_read32(VM_ENTRY_CONTROLS) 1643 vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1598,10 +1653,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1598 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1653 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1599} 1654}
1600 1655
1656static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1657{
1658 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
1659
1660 vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
1661 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1662}
1663
1601static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1664static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1602{ 1665{
1603 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; 1666 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
1604 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; 1667
1668 vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
1669 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
1605} 1670}
1606 1671
1607static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 1672static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -1646,7 +1711,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1646 (CPU_BASED_CR3_LOAD_EXITING | 1711 (CPU_BASED_CR3_LOAD_EXITING |
1647 CPU_BASED_CR3_STORE_EXITING)); 1712 CPU_BASED_CR3_STORE_EXITING));
1648 vcpu->arch.cr0 = cr0; 1713 vcpu->arch.cr0 = cr0;
1649 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1714 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1650 } else if (!is_paging(vcpu)) { 1715 } else if (!is_paging(vcpu)) {
1651 /* From nonpaging to paging */ 1716 /* From nonpaging to paging */
1652 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1717 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1654,23 +1719,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1654 ~(CPU_BASED_CR3_LOAD_EXITING | 1719 ~(CPU_BASED_CR3_LOAD_EXITING |
1655 CPU_BASED_CR3_STORE_EXITING)); 1720 CPU_BASED_CR3_STORE_EXITING));
1656 vcpu->arch.cr0 = cr0; 1721 vcpu->arch.cr0 = cr0;
1657 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1722 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1658 } 1723 }
1659 1724
1660 if (!(cr0 & X86_CR0_WP)) 1725 if (!(cr0 & X86_CR0_WP))
1661 *hw_cr0 &= ~X86_CR0_WP; 1726 *hw_cr0 &= ~X86_CR0_WP;
1662} 1727}
1663 1728
1664static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
1665 struct kvm_vcpu *vcpu)
1666{
1667 if (!is_paging(vcpu)) {
1668 *hw_cr4 &= ~X86_CR4_PAE;
1669 *hw_cr4 |= X86_CR4_PSE;
1670 } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
1671 *hw_cr4 &= ~X86_CR4_PAE;
1672}
1673
1674static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1729static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1675{ 1730{
1676 struct vcpu_vmx *vmx = to_vmx(vcpu); 1731 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1682,8 +1737,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1682 else 1737 else
1683 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; 1738 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
1684 1739
1685 vmx_fpu_deactivate(vcpu);
1686
1687 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 1740 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
1688 enter_pmode(vcpu); 1741 enter_pmode(vcpu);
1689 1742
@@ -1691,7 +1744,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1691 enter_rmode(vcpu); 1744 enter_rmode(vcpu);
1692 1745
1693#ifdef CONFIG_X86_64 1746#ifdef CONFIG_X86_64
1694 if (vcpu->arch.shadow_efer & EFER_LME) { 1747 if (vcpu->arch.efer & EFER_LME) {
1695 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 1748 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1696 enter_lmode(vcpu); 1749 enter_lmode(vcpu);
1697 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 1750 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1702,12 +1755,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1702 if (enable_ept) 1755 if (enable_ept)
1703 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 1756 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1704 1757
1758 if (!vcpu->fpu_active)
1759 hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
1760
1705 vmcs_writel(CR0_READ_SHADOW, cr0); 1761 vmcs_writel(CR0_READ_SHADOW, cr0);
1706 vmcs_writel(GUEST_CR0, hw_cr0); 1762 vmcs_writel(GUEST_CR0, hw_cr0);
1707 vcpu->arch.cr0 = cr0; 1763 vcpu->arch.cr0 = cr0;
1708
1709 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1710 vmx_fpu_activate(vcpu);
1711} 1764}
1712 1765
1713static u64 construct_eptp(unsigned long root_hpa) 1766static u64 construct_eptp(unsigned long root_hpa)
@@ -1738,8 +1791,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1738 1791
1739 vmx_flush_tlb(vcpu); 1792 vmx_flush_tlb(vcpu);
1740 vmcs_writel(GUEST_CR3, guest_cr3); 1793 vmcs_writel(GUEST_CR3, guest_cr3);
1741 if (vcpu->arch.cr0 & X86_CR0_PE)
1742 vmx_fpu_deactivate(vcpu);
1743} 1794}
1744 1795
1745static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1796static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1748,8 +1799,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1748 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 1799 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1749 1800
1750 vcpu->arch.cr4 = cr4; 1801 vcpu->arch.cr4 = cr4;
1751 if (enable_ept) 1802 if (enable_ept) {
1752 ept_update_paging_mode_cr4(&hw_cr4, vcpu); 1803 if (!is_paging(vcpu)) {
1804 hw_cr4 &= ~X86_CR4_PAE;
1805 hw_cr4 |= X86_CR4_PSE;
1806 } else if (!(cr4 & X86_CR4_PAE)) {
1807 hw_cr4 &= ~X86_CR4_PAE;
1808 }
1809 }
1753 1810
1754 vmcs_writel(CR4_READ_SHADOW, cr4); 1811 vmcs_writel(CR4_READ_SHADOW, cr4);
1755 vmcs_writel(GUEST_CR4, hw_cr4); 1812 vmcs_writel(GUEST_CR4, hw_cr4);
@@ -1787,7 +1844,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
1787 1844
1788static int vmx_get_cpl(struct kvm_vcpu *vcpu) 1845static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1789{ 1846{
1790 if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ 1847 if (!is_protmode(vcpu))
1791 return 0; 1848 return 0;
1792 1849
1793 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ 1850 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
@@ -2042,7 +2099,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
2042static bool guest_state_valid(struct kvm_vcpu *vcpu) 2099static bool guest_state_valid(struct kvm_vcpu *vcpu)
2043{ 2100{
2044 /* real mode guest state checks */ 2101 /* real mode guest state checks */
2045 if (!(vcpu->arch.cr0 & X86_CR0_PE)) { 2102 if (!is_protmode(vcpu)) {
2046 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 2103 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
2047 return false; 2104 return false;
2048 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 2105 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
@@ -2175,7 +2232,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
2175 struct kvm_userspace_memory_region kvm_userspace_mem; 2232 struct kvm_userspace_memory_region kvm_userspace_mem;
2176 int r = 0; 2233 int r = 0;
2177 2234
2178 down_write(&kvm->slots_lock); 2235 mutex_lock(&kvm->slots_lock);
2179 if (kvm->arch.apic_access_page) 2236 if (kvm->arch.apic_access_page)
2180 goto out; 2237 goto out;
2181 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; 2238 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -2188,7 +2245,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
2188 2245
2189 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); 2246 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
2190out: 2247out:
2191 up_write(&kvm->slots_lock); 2248 mutex_unlock(&kvm->slots_lock);
2192 return r; 2249 return r;
2193} 2250}
2194 2251
@@ -2197,7 +2254,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2197 struct kvm_userspace_memory_region kvm_userspace_mem; 2254 struct kvm_userspace_memory_region kvm_userspace_mem;
2198 int r = 0; 2255 int r = 0;
2199 2256
2200 down_write(&kvm->slots_lock); 2257 mutex_lock(&kvm->slots_lock);
2201 if (kvm->arch.ept_identity_pagetable) 2258 if (kvm->arch.ept_identity_pagetable)
2202 goto out; 2259 goto out;
2203 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 2260 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
@@ -2212,7 +2269,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2212 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2269 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
2213 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); 2270 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
2214out: 2271out:
2215 up_write(&kvm->slots_lock); 2272 mutex_unlock(&kvm->slots_lock);
2216 return r; 2273 return r;
2217} 2274}
2218 2275
@@ -2384,14 +2441,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2384 for (i = 0; i < NR_VMX_MSR; ++i) { 2441 for (i = 0; i < NR_VMX_MSR; ++i) {
2385 u32 index = vmx_msr_index[i]; 2442 u32 index = vmx_msr_index[i];
2386 u32 data_low, data_high; 2443 u32 data_low, data_high;
2387 u64 data;
2388 int j = vmx->nmsrs; 2444 int j = vmx->nmsrs;
2389 2445
2390 if (rdmsr_safe(index, &data_low, &data_high) < 0) 2446 if (rdmsr_safe(index, &data_low, &data_high) < 0)
2391 continue; 2447 continue;
2392 if (wrmsr_safe(index, data_low, data_high) < 0) 2448 if (wrmsr_safe(index, data_low, data_high) < 0)
2393 continue; 2449 continue;
2394 data = data_low | ((u64)data_high << 32);
2395 vmx->guest_msrs[j].index = i; 2450 vmx->guest_msrs[j].index = i;
2396 vmx->guest_msrs[j].data = 0; 2451 vmx->guest_msrs[j].data = 0;
2397 vmx->guest_msrs[j].mask = -1ull; 2452 vmx->guest_msrs[j].mask = -1ull;
@@ -2404,7 +2459,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2404 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 2459 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2405 2460
2406 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 2461 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2407 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 2462 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
2463 if (enable_ept)
2464 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2465 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2408 2466
2409 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; 2467 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
2410 rdtscll(tsc_this); 2468 rdtscll(tsc_this);
@@ -2429,10 +2487,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2429{ 2487{
2430 struct vcpu_vmx *vmx = to_vmx(vcpu); 2488 struct vcpu_vmx *vmx = to_vmx(vcpu);
2431 u64 msr; 2489 u64 msr;
2432 int ret; 2490 int ret, idx;
2433 2491
2434 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2492 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2435 down_read(&vcpu->kvm->slots_lock); 2493 idx = srcu_read_lock(&vcpu->kvm->srcu);
2436 if (!init_rmode(vmx->vcpu.kvm)) { 2494 if (!init_rmode(vmx->vcpu.kvm)) {
2437 ret = -ENOMEM; 2495 ret = -ENOMEM;
2438 goto out; 2496 goto out;
@@ -2526,7 +2584,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2526 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2584 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2527 2585
2528 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 2586 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
2529 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ 2587 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
2530 vmx_set_cr4(&vmx->vcpu, 0); 2588 vmx_set_cr4(&vmx->vcpu, 0);
2531 vmx_set_efer(&vmx->vcpu, 0); 2589 vmx_set_efer(&vmx->vcpu, 0);
2532 vmx_fpu_activate(&vmx->vcpu); 2590 vmx_fpu_activate(&vmx->vcpu);
@@ -2540,7 +2598,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2540 vmx->emulation_required = 0; 2598 vmx->emulation_required = 0;
2541 2599
2542out: 2600out:
2543 up_read(&vcpu->kvm->slots_lock); 2601 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2544 return ret; 2602 return ret;
2545} 2603}
2546 2604
@@ -2717,6 +2775,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2717 kvm_queue_exception(vcpu, vec); 2775 kvm_queue_exception(vcpu, vec);
2718 return 1; 2776 return 1;
2719 case BP_VECTOR: 2777 case BP_VECTOR:
2778 /*
2779 * Update instruction length as we may reinject the exception
2780 * from user space while in guest debugging mode.
2781 */
2782 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
2783 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2720 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 2784 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2721 return 0; 2785 return 0;
2722 /* fall through */ 2786 /* fall through */
@@ -2839,6 +2903,13 @@ static int handle_exception(struct kvm_vcpu *vcpu)
2839 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 2903 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
2840 /* fall through */ 2904 /* fall through */
2841 case BP_VECTOR: 2905 case BP_VECTOR:
2906 /*
2907 * Update instruction length as we may reinject #BP from
2908 * user space while in guest debugging mode. Reading it for
2909 * #DB as well causes no harm, it is not used in that case.
2910 */
2911 vmx->vcpu.arch.event_exit_inst_len =
2912 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2842 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2913 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2843 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 2914 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
2844 kvm_run->debug.arch.exception = ex_no; 2915 kvm_run->debug.arch.exception = ex_no;
@@ -2940,11 +3011,10 @@ static int handle_cr(struct kvm_vcpu *vcpu)
2940 }; 3011 };
2941 break; 3012 break;
2942 case 2: /* clts */ 3013 case 2: /* clts */
2943 vmx_fpu_deactivate(vcpu); 3014 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
2944 vcpu->arch.cr0 &= ~X86_CR0_TS; 3015 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
2945 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2946 vmx_fpu_activate(vcpu);
2947 skip_emulated_instruction(vcpu); 3016 skip_emulated_instruction(vcpu);
3017 vmx_fpu_activate(vcpu);
2948 return 1; 3018 return 1;
2949 case 1: /*mov from cr*/ 3019 case 1: /*mov from cr*/
2950 switch (cr) { 3020 switch (cr) {
@@ -2962,7 +3032,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
2962 } 3032 }
2963 break; 3033 break;
2964 case 3: /* lmsw */ 3034 case 3: /* lmsw */
2965 kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); 3035 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
3036 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
3037 kvm_lmsw(vcpu, val);
2966 3038
2967 skip_emulated_instruction(vcpu); 3039 skip_emulated_instruction(vcpu);
2968 return 1; 3040 return 1;
@@ -2975,12 +3047,22 @@ static int handle_cr(struct kvm_vcpu *vcpu)
2975 return 0; 3047 return 0;
2976} 3048}
2977 3049
3050static int check_dr_alias(struct kvm_vcpu *vcpu)
3051{
3052 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
3053 kvm_queue_exception(vcpu, UD_VECTOR);
3054 return -1;
3055 }
3056 return 0;
3057}
3058
2978static int handle_dr(struct kvm_vcpu *vcpu) 3059static int handle_dr(struct kvm_vcpu *vcpu)
2979{ 3060{
2980 unsigned long exit_qualification; 3061 unsigned long exit_qualification;
2981 unsigned long val; 3062 unsigned long val;
2982 int dr, reg; 3063 int dr, reg;
2983 3064
3065 /* Do not handle if the CPL > 0, will trigger GP on re-entry */
2984 if (!kvm_require_cpl(vcpu, 0)) 3066 if (!kvm_require_cpl(vcpu, 0))
2985 return 1; 3067 return 1;
2986 dr = vmcs_readl(GUEST_DR7); 3068 dr = vmcs_readl(GUEST_DR7);
@@ -3016,14 +3098,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
3016 case 0 ... 3: 3098 case 0 ... 3:
3017 val = vcpu->arch.db[dr]; 3099 val = vcpu->arch.db[dr];
3018 break; 3100 break;
3101 case 4:
3102 if (check_dr_alias(vcpu) < 0)
3103 return 1;
3104 /* fall through */
3019 case 6: 3105 case 6:
3020 val = vcpu->arch.dr6; 3106 val = vcpu->arch.dr6;
3021 break; 3107 break;
3022 case 7: 3108 case 5:
3109 if (check_dr_alias(vcpu) < 0)
3110 return 1;
3111 /* fall through */
3112 default: /* 7 */
3023 val = vcpu->arch.dr7; 3113 val = vcpu->arch.dr7;
3024 break; 3114 break;
3025 default:
3026 val = 0;
3027 } 3115 }
3028 kvm_register_write(vcpu, reg, val); 3116 kvm_register_write(vcpu, reg, val);
3029 } else { 3117 } else {
@@ -3034,21 +3122,25 @@ static int handle_dr(struct kvm_vcpu *vcpu)
3034 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 3122 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
3035 vcpu->arch.eff_db[dr] = val; 3123 vcpu->arch.eff_db[dr] = val;
3036 break; 3124 break;
3037 case 4 ... 5: 3125 case 4:
3038 if (vcpu->arch.cr4 & X86_CR4_DE) 3126 if (check_dr_alias(vcpu) < 0)
3039 kvm_queue_exception(vcpu, UD_VECTOR); 3127 return 1;
3040 break; 3128 /* fall through */
3041 case 6: 3129 case 6:
3042 if (val & 0xffffffff00000000ULL) { 3130 if (val & 0xffffffff00000000ULL) {
3043 kvm_queue_exception(vcpu, GP_VECTOR); 3131 kvm_inject_gp(vcpu, 0);
3044 break; 3132 return 1;
3045 } 3133 }
3046 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 3134 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
3047 break; 3135 break;
3048 case 7: 3136 case 5:
3137 if (check_dr_alias(vcpu) < 0)
3138 return 1;
3139 /* fall through */
3140 default: /* 7 */
3049 if (val & 0xffffffff00000000ULL) { 3141 if (val & 0xffffffff00000000ULL) {
3050 kvm_queue_exception(vcpu, GP_VECTOR); 3142 kvm_inject_gp(vcpu, 0);
3051 break; 3143 return 1;
3052 } 3144 }
3053 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 3145 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
3054 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 3146 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
@@ -3075,6 +3167,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
3075 u64 data; 3167 u64 data;
3076 3168
3077 if (vmx_get_msr(vcpu, ecx, &data)) { 3169 if (vmx_get_msr(vcpu, ecx, &data)) {
3170 trace_kvm_msr_read_ex(ecx);
3078 kvm_inject_gp(vcpu, 0); 3171 kvm_inject_gp(vcpu, 0);
3079 return 1; 3172 return 1;
3080 } 3173 }
@@ -3094,13 +3187,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
3094 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3187 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
3095 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3188 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3096 3189
3097 trace_kvm_msr_write(ecx, data);
3098
3099 if (vmx_set_msr(vcpu, ecx, data) != 0) { 3190 if (vmx_set_msr(vcpu, ecx, data) != 0) {
3191 trace_kvm_msr_write_ex(ecx, data);
3100 kvm_inject_gp(vcpu, 0); 3192 kvm_inject_gp(vcpu, 0);
3101 return 1; 3193 return 1;
3102 } 3194 }
3103 3195
3196 trace_kvm_msr_write(ecx, data);
3104 skip_emulated_instruction(vcpu); 3197 skip_emulated_instruction(vcpu);
3105 return 1; 3198 return 1;
3106} 3199}
@@ -3385,7 +3478,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3385 } 3478 }
3386 3479
3387 if (err != EMULATE_DONE) { 3480 if (err != EMULATE_DONE) {
3388 kvm_report_emulation_failure(vcpu, "emulation failure");
3389 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3481 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3390 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 3482 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3391 vcpu->run->internal.ndata = 0; 3483 vcpu->run->internal.ndata = 0;
@@ -3416,6 +3508,12 @@ static int handle_pause(struct kvm_vcpu *vcpu)
3416 return 1; 3508 return 1;
3417} 3509}
3418 3510
3511static int handle_invalid_op(struct kvm_vcpu *vcpu)
3512{
3513 kvm_queue_exception(vcpu, UD_VECTOR);
3514 return 1;
3515}
3516
3419/* 3517/*
3420 * The exit handlers return 1 if the exit was handled fully and guest execution 3518 * The exit handlers return 1 if the exit was handled fully and guest execution
3421 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 3519 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -3453,6 +3551,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3453 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3551 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3454 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 3552 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3455 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 3553 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
3554 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
3555 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
3456}; 3556};
3457 3557
3458static const int kvm_vmx_max_exit_handlers = 3558static const int kvm_vmx_max_exit_handlers =
@@ -3686,9 +3786,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3686 */ 3786 */
3687 vmcs_writel(HOST_CR0, read_cr0()); 3787 vmcs_writel(HOST_CR0, read_cr0());
3688 3788
3689 if (vcpu->arch.switch_db_regs)
3690 set_debugreg(vcpu->arch.dr6, 6);
3691
3692 asm( 3789 asm(
3693 /* Store host registers */ 3790 /* Store host registers */
3694 "push %%"R"dx; push %%"R"bp;" 3791 "push %%"R"dx; push %%"R"bp;"
@@ -3789,9 +3886,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3789 | (1 << VCPU_EXREG_PDPTR)); 3886 | (1 << VCPU_EXREG_PDPTR));
3790 vcpu->arch.regs_dirty = 0; 3887 vcpu->arch.regs_dirty = 0;
3791 3888
3792 if (vcpu->arch.switch_db_regs)
3793 get_debugreg(vcpu->arch.dr6, 6);
3794
3795 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3889 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3796 if (vmx->rmode.irq.pending) 3890 if (vmx->rmode.irq.pending)
3797 fixup_rmode_irq(vmx); 3891 fixup_rmode_irq(vmx);
@@ -3920,7 +4014,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3920 * b. VT-d with snooping control feature: snooping control feature of 4014 * b. VT-d with snooping control feature: snooping control feature of
3921 * VT-d engine can guarantee the cache correctness. Just set it 4015 * VT-d engine can guarantee the cache correctness. Just set it
3922 * to WB to keep consistent with host. So the same as item 3. 4016 * to WB to keep consistent with host. So the same as item 3.
3923 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep 4017 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
3924 * consistent with host MTRR 4018 * consistent with host MTRR
3925 */ 4019 */
3926 if (is_mmio) 4020 if (is_mmio)
@@ -3931,37 +4025,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3931 VMX_EPT_MT_EPTE_SHIFT; 4025 VMX_EPT_MT_EPTE_SHIFT;
3932 else 4026 else
3933 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) 4027 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
3934 | VMX_EPT_IGMT_BIT; 4028 | VMX_EPT_IPAT_BIT;
3935 4029
3936 return ret; 4030 return ret;
3937} 4031}
3938 4032
4033#define _ER(x) { EXIT_REASON_##x, #x }
4034
3939static const struct trace_print_flags vmx_exit_reasons_str[] = { 4035static const struct trace_print_flags vmx_exit_reasons_str[] = {
3940 { EXIT_REASON_EXCEPTION_NMI, "exception" }, 4036 _ER(EXCEPTION_NMI),
3941 { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, 4037 _ER(EXTERNAL_INTERRUPT),
3942 { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, 4038 _ER(TRIPLE_FAULT),
3943 { EXIT_REASON_NMI_WINDOW, "nmi_window" }, 4039 _ER(PENDING_INTERRUPT),
3944 { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, 4040 _ER(NMI_WINDOW),
3945 { EXIT_REASON_CR_ACCESS, "cr_access" }, 4041 _ER(TASK_SWITCH),
3946 { EXIT_REASON_DR_ACCESS, "dr_access" }, 4042 _ER(CPUID),
3947 { EXIT_REASON_CPUID, "cpuid" }, 4043 _ER(HLT),
3948 { EXIT_REASON_MSR_READ, "rdmsr" }, 4044 _ER(INVLPG),
3949 { EXIT_REASON_MSR_WRITE, "wrmsr" }, 4045 _ER(RDPMC),
3950 { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, 4046 _ER(RDTSC),
3951 { EXIT_REASON_HLT, "halt" }, 4047 _ER(VMCALL),
3952 { EXIT_REASON_INVLPG, "invlpg" }, 4048 _ER(VMCLEAR),
3953 { EXIT_REASON_VMCALL, "hypercall" }, 4049 _ER(VMLAUNCH),
3954 { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, 4050 _ER(VMPTRLD),
3955 { EXIT_REASON_APIC_ACCESS, "apic_access" }, 4051 _ER(VMPTRST),
3956 { EXIT_REASON_WBINVD, "wbinvd" }, 4052 _ER(VMREAD),
3957 { EXIT_REASON_TASK_SWITCH, "task_switch" }, 4053 _ER(VMRESUME),
3958 { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, 4054 _ER(VMWRITE),
4055 _ER(VMOFF),
4056 _ER(VMON),
4057 _ER(CR_ACCESS),
4058 _ER(DR_ACCESS),
4059 _ER(IO_INSTRUCTION),
4060 _ER(MSR_READ),
4061 _ER(MSR_WRITE),
4062 _ER(MWAIT_INSTRUCTION),
4063 _ER(MONITOR_INSTRUCTION),
4064 _ER(PAUSE_INSTRUCTION),
4065 _ER(MCE_DURING_VMENTRY),
4066 _ER(TPR_BELOW_THRESHOLD),
4067 _ER(APIC_ACCESS),
4068 _ER(EPT_VIOLATION),
4069 _ER(EPT_MISCONFIG),
4070 _ER(WBINVD),
3959 { -1, NULL } 4071 { -1, NULL }
3960}; 4072};
3961 4073
3962static bool vmx_gb_page_enable(void) 4074#undef _ER
4075
4076static int vmx_get_lpage_level(void)
4077{
4078 if (enable_ept && !cpu_has_vmx_ept_1g_page())
4079 return PT_DIRECTORY_LEVEL;
4080 else
4081 /* For shadow and EPT supported 1GB page */
4082 return PT_PDPE_LEVEL;
4083}
4084
4085static inline u32 bit(int bitno)
4086{
4087 return 1 << (bitno & 31);
4088}
4089
4090static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
3963{ 4091{
3964 return false; 4092 struct kvm_cpuid_entry2 *best;
4093 struct vcpu_vmx *vmx = to_vmx(vcpu);
4094 u32 exec_control;
4095
4096 vmx->rdtscp_enabled = false;
4097 if (vmx_rdtscp_supported()) {
4098 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
4099 if (exec_control & SECONDARY_EXEC_RDTSCP) {
4100 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
4101 if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
4102 vmx->rdtscp_enabled = true;
4103 else {
4104 exec_control &= ~SECONDARY_EXEC_RDTSCP;
4105 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4106 exec_control);
4107 }
4108 }
4109 }
3965} 4110}
3966 4111
3967static struct kvm_x86_ops vmx_x86_ops = { 4112static struct kvm_x86_ops vmx_x86_ops = {
@@ -3990,6 +4135,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3990 .set_segment = vmx_set_segment, 4135 .set_segment = vmx_set_segment,
3991 .get_cpl = vmx_get_cpl, 4136 .get_cpl = vmx_get_cpl,
3992 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 4137 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
4138 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
3993 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 4139 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
3994 .set_cr0 = vmx_set_cr0, 4140 .set_cr0 = vmx_set_cr0,
3995 .set_cr3 = vmx_set_cr3, 4141 .set_cr3 = vmx_set_cr3,
@@ -4002,6 +4148,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
4002 .cache_reg = vmx_cache_reg, 4148 .cache_reg = vmx_cache_reg,
4003 .get_rflags = vmx_get_rflags, 4149 .get_rflags = vmx_get_rflags,
4004 .set_rflags = vmx_set_rflags, 4150 .set_rflags = vmx_set_rflags,
4151 .fpu_activate = vmx_fpu_activate,
4152 .fpu_deactivate = vmx_fpu_deactivate,
4005 4153
4006 .tlb_flush = vmx_flush_tlb, 4154 .tlb_flush = vmx_flush_tlb,
4007 4155
@@ -4027,7 +4175,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
4027 .get_mt_mask = vmx_get_mt_mask, 4175 .get_mt_mask = vmx_get_mt_mask,
4028 4176
4029 .exit_reasons_str = vmx_exit_reasons_str, 4177 .exit_reasons_str = vmx_exit_reasons_str,
4030 .gb_page_enable = vmx_gb_page_enable, 4178 .get_lpage_level = vmx_get_lpage_level,
4179
4180 .cpuid_update = vmx_cpuid_update,
4181
4182 .rdtscp_supported = vmx_rdtscp_supported,
4031}; 4183};
4032 4184
4033static int __init vmx_init(void) 4185static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1ddcad452add..e46282a56565 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -38,6 +38,7 @@
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <linux/user-return-notifier.h> 40#include <linux/user-return-notifier.h>
41#include <linux/srcu.h>
41#include <trace/events/kvm.h> 42#include <trace/events/kvm.h>
42#undef TRACE_INCLUDE_FILE 43#undef TRACE_INCLUDE_FILE
43#define CREATE_TRACE_POINTS 44#define CREATE_TRACE_POINTS
@@ -93,16 +94,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
93 94
94struct kvm_shared_msrs_global { 95struct kvm_shared_msrs_global {
95 int nr; 96 int nr;
96 struct kvm_shared_msr { 97 u32 msrs[KVM_NR_SHARED_MSRS];
97 u32 msr;
98 u64 value;
99 } msrs[KVM_NR_SHARED_MSRS];
100}; 98};
101 99
102struct kvm_shared_msrs { 100struct kvm_shared_msrs {
103 struct user_return_notifier urn; 101 struct user_return_notifier urn;
104 bool registered; 102 bool registered;
105 u64 current_value[KVM_NR_SHARED_MSRS]; 103 struct kvm_shared_msr_values {
104 u64 host;
105 u64 curr;
106 } values[KVM_NR_SHARED_MSRS];
106}; 107};
107 108
108static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; 109static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
@@ -147,53 +148,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
147static void kvm_on_user_return(struct user_return_notifier *urn) 148static void kvm_on_user_return(struct user_return_notifier *urn)
148{ 149{
149 unsigned slot; 150 unsigned slot;
150 struct kvm_shared_msr *global;
151 struct kvm_shared_msrs *locals 151 struct kvm_shared_msrs *locals
152 = container_of(urn, struct kvm_shared_msrs, urn); 152 = container_of(urn, struct kvm_shared_msrs, urn);
153 struct kvm_shared_msr_values *values;
153 154
154 for (slot = 0; slot < shared_msrs_global.nr; ++slot) { 155 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
155 global = &shared_msrs_global.msrs[slot]; 156 values = &locals->values[slot];
156 if (global->value != locals->current_value[slot]) { 157 if (values->host != values->curr) {
157 wrmsrl(global->msr, global->value); 158 wrmsrl(shared_msrs_global.msrs[slot], values->host);
158 locals->current_value[slot] = global->value; 159 values->curr = values->host;
159 } 160 }
160 } 161 }
161 locals->registered = false; 162 locals->registered = false;
162 user_return_notifier_unregister(urn); 163 user_return_notifier_unregister(urn);
163} 164}
164 165
165void kvm_define_shared_msr(unsigned slot, u32 msr) 166static void shared_msr_update(unsigned slot, u32 msr)
166{ 167{
167 int cpu; 168 struct kvm_shared_msrs *smsr;
168 u64 value; 169 u64 value;
169 170
171 smsr = &__get_cpu_var(shared_msrs);
172 /* only read, and nobody should modify it at this time,
173 * so don't need lock */
174 if (slot >= shared_msrs_global.nr) {
175 printk(KERN_ERR "kvm: invalid MSR slot!");
176 return;
177 }
178 rdmsrl_safe(msr, &value);
179 smsr->values[slot].host = value;
180 smsr->values[slot].curr = value;
181}
182
183void kvm_define_shared_msr(unsigned slot, u32 msr)
184{
170 if (slot >= shared_msrs_global.nr) 185 if (slot >= shared_msrs_global.nr)
171 shared_msrs_global.nr = slot + 1; 186 shared_msrs_global.nr = slot + 1;
172 shared_msrs_global.msrs[slot].msr = msr; 187 shared_msrs_global.msrs[slot] = msr;
173 rdmsrl_safe(msr, &value); 188 /* we need ensured the shared_msr_global have been updated */
174 shared_msrs_global.msrs[slot].value = value; 189 smp_wmb();
175 for_each_online_cpu(cpu)
176 per_cpu(shared_msrs, cpu).current_value[slot] = value;
177} 190}
178EXPORT_SYMBOL_GPL(kvm_define_shared_msr); 191EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
179 192
180static void kvm_shared_msr_cpu_online(void) 193static void kvm_shared_msr_cpu_online(void)
181{ 194{
182 unsigned i; 195 unsigned i;
183 struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
184 196
185 for (i = 0; i < shared_msrs_global.nr; ++i) 197 for (i = 0; i < shared_msrs_global.nr; ++i)
186 locals->current_value[i] = shared_msrs_global.msrs[i].value; 198 shared_msr_update(i, shared_msrs_global.msrs[i]);
187} 199}
188 200
189void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) 201void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
190{ 202{
191 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 203 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
192 204
193 if (((value ^ smsr->current_value[slot]) & mask) == 0) 205 if (((value ^ smsr->values[slot].curr) & mask) == 0)
194 return; 206 return;
195 smsr->current_value[slot] = value; 207 smsr->values[slot].curr = value;
196 wrmsrl(shared_msrs_global.msrs[slot].msr, value); 208 wrmsrl(shared_msrs_global.msrs[slot], value);
197 if (!smsr->registered) { 209 if (!smsr->registered) {
198 smsr->urn.on_user_return = kvm_on_user_return; 210 smsr->urn.on_user_return = kvm_on_user_return;
199 user_return_notifier_register(&smsr->urn); 211 user_return_notifier_register(&smsr->urn);
@@ -257,12 +269,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
257} 269}
258EXPORT_SYMBOL_GPL(kvm_set_apic_base); 270EXPORT_SYMBOL_GPL(kvm_set_apic_base);
259 271
272#define EXCPT_BENIGN 0
273#define EXCPT_CONTRIBUTORY 1
274#define EXCPT_PF 2
275
276static int exception_class(int vector)
277{
278 switch (vector) {
279 case PF_VECTOR:
280 return EXCPT_PF;
281 case DE_VECTOR:
282 case TS_VECTOR:
283 case NP_VECTOR:
284 case SS_VECTOR:
285 case GP_VECTOR:
286 return EXCPT_CONTRIBUTORY;
287 default:
288 break;
289 }
290 return EXCPT_BENIGN;
291}
292
293static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
294 unsigned nr, bool has_error, u32 error_code)
295{
296 u32 prev_nr;
297 int class1, class2;
298
299 if (!vcpu->arch.exception.pending) {
300 queue:
301 vcpu->arch.exception.pending = true;
302 vcpu->arch.exception.has_error_code = has_error;
303 vcpu->arch.exception.nr = nr;
304 vcpu->arch.exception.error_code = error_code;
305 return;
306 }
307
308 /* to check exception */
309 prev_nr = vcpu->arch.exception.nr;
310 if (prev_nr == DF_VECTOR) {
311 /* triple fault -> shutdown */
312 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
313 return;
314 }
315 class1 = exception_class(prev_nr);
316 class2 = exception_class(nr);
317 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
318 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
319 /* generate double fault per SDM Table 5-5 */
320 vcpu->arch.exception.pending = true;
321 vcpu->arch.exception.has_error_code = true;
322 vcpu->arch.exception.nr = DF_VECTOR;
323 vcpu->arch.exception.error_code = 0;
324 } else
325 /* replace previous exception with a new one in a hope
326 that instruction re-execution will regenerate lost
327 exception */
328 goto queue;
329}
330
260void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 331void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
261{ 332{
262 WARN_ON(vcpu->arch.exception.pending); 333 kvm_multiple_exception(vcpu, nr, false, 0);
263 vcpu->arch.exception.pending = true;
264 vcpu->arch.exception.has_error_code = false;
265 vcpu->arch.exception.nr = nr;
266} 334}
267EXPORT_SYMBOL_GPL(kvm_queue_exception); 335EXPORT_SYMBOL_GPL(kvm_queue_exception);
268 336
@@ -270,25 +338,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
270 u32 error_code) 338 u32 error_code)
271{ 339{
272 ++vcpu->stat.pf_guest; 340 ++vcpu->stat.pf_guest;
273
274 if (vcpu->arch.exception.pending) {
275 switch(vcpu->arch.exception.nr) {
276 case DF_VECTOR:
277 /* triple fault -> shutdown */
278 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
279 return;
280 case PF_VECTOR:
281 vcpu->arch.exception.nr = DF_VECTOR;
282 vcpu->arch.exception.error_code = 0;
283 return;
284 default:
285 /* replace previous exception with a new one in a hope
286 that instruction re-execution will regenerate lost
287 exception */
288 vcpu->arch.exception.pending = false;
289 break;
290 }
291 }
292 vcpu->arch.cr2 = addr; 341 vcpu->arch.cr2 = addr;
293 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 342 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
294} 343}
@@ -301,11 +350,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
301 350
302void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 351void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
303{ 352{
304 WARN_ON(vcpu->arch.exception.pending); 353 kvm_multiple_exception(vcpu, nr, true, error_code);
305 vcpu->arch.exception.pending = true;
306 vcpu->arch.exception.has_error_code = true;
307 vcpu->arch.exception.nr = nr;
308 vcpu->arch.exception.error_code = error_code;
309} 354}
310EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 355EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
311 356
@@ -383,12 +428,18 @@ out:
383 428
384void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 429void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
385{ 430{
386 if (cr0 & CR0_RESERVED_BITS) { 431 cr0 |= X86_CR0_ET;
432
433#ifdef CONFIG_X86_64
434 if (cr0 & 0xffffffff00000000UL) {
387 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 435 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
388 cr0, vcpu->arch.cr0); 436 cr0, kvm_read_cr0(vcpu));
389 kvm_inject_gp(vcpu, 0); 437 kvm_inject_gp(vcpu, 0);
390 return; 438 return;
391 } 439 }
440#endif
441
442 cr0 &= ~CR0_RESERVED_BITS;
392 443
393 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 444 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
394 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 445 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
@@ -405,7 +456,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
405 456
406 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 457 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
407#ifdef CONFIG_X86_64 458#ifdef CONFIG_X86_64
408 if ((vcpu->arch.shadow_efer & EFER_LME)) { 459 if ((vcpu->arch.efer & EFER_LME)) {
409 int cs_db, cs_l; 460 int cs_db, cs_l;
410 461
411 if (!is_pae(vcpu)) { 462 if (!is_pae(vcpu)) {
@@ -443,13 +494,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
443 494
444void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 495void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
445{ 496{
446 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 497 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
447} 498}
448EXPORT_SYMBOL_GPL(kvm_lmsw); 499EXPORT_SYMBOL_GPL(kvm_lmsw);
449 500
450void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 501void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
451{ 502{
452 unsigned long old_cr4 = vcpu->arch.cr4; 503 unsigned long old_cr4 = kvm_read_cr4(vcpu);
453 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 504 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
454 505
455 if (cr4 & CR4_RESERVED_BITS) { 506 if (cr4 & CR4_RESERVED_BITS) {
@@ -575,9 +626,11 @@ static inline u32 bit(int bitno)
575 * kvm-specific. Those are put in the beginning of the list. 626 * kvm-specific. Those are put in the beginning of the list.
576 */ 627 */
577 628
578#define KVM_SAVE_MSRS_BEGIN 2 629#define KVM_SAVE_MSRS_BEGIN 5
579static u32 msrs_to_save[] = { 630static u32 msrs_to_save[] = {
580 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 631 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
632 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
633 HV_X64_MSR_APIC_ASSIST_PAGE,
581 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 634 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
582 MSR_K6_STAR, 635 MSR_K6_STAR,
583#ifdef CONFIG_X86_64 636#ifdef CONFIG_X86_64
@@ -602,7 +655,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
602 } 655 }
603 656
604 if (is_paging(vcpu) 657 if (is_paging(vcpu)
605 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { 658 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
606 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); 659 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
607 kvm_inject_gp(vcpu, 0); 660 kvm_inject_gp(vcpu, 0);
608 return; 661 return;
@@ -633,9 +686,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
633 kvm_x86_ops->set_efer(vcpu, efer); 686 kvm_x86_ops->set_efer(vcpu, efer);
634 687
635 efer &= ~EFER_LMA; 688 efer &= ~EFER_LMA;
636 efer |= vcpu->arch.shadow_efer & EFER_LMA; 689 efer |= vcpu->arch.efer & EFER_LMA;
637 690
638 vcpu->arch.shadow_efer = efer; 691 vcpu->arch.efer = efer;
639 692
640 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 693 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
641 kvm_mmu_reset_context(vcpu); 694 kvm_mmu_reset_context(vcpu);
@@ -670,7 +723,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
670{ 723{
671 static int version; 724 static int version;
672 struct pvclock_wall_clock wc; 725 struct pvclock_wall_clock wc;
673 struct timespec now, sys, boot; 726 struct timespec boot;
674 727
675 if (!wall_clock) 728 if (!wall_clock)
676 return; 729 return;
@@ -685,9 +738,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
685 * wall clock specified here. guest system time equals host 738 * wall clock specified here. guest system time equals host
686 * system time for us, thus we must fill in host boot time here. 739 * system time for us, thus we must fill in host boot time here.
687 */ 740 */
688 now = current_kernel_time(); 741 getboottime(&boot);
689 ktime_get_ts(&sys);
690 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
691 742
692 wc.sec = boot.tv_sec; 743 wc.sec = boot.tv_sec;
693 wc.nsec = boot.tv_nsec; 744 wc.nsec = boot.tv_nsec;
@@ -762,6 +813,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
762 local_irq_save(flags); 813 local_irq_save(flags);
763 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); 814 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
764 ktime_get_ts(&ts); 815 ktime_get_ts(&ts);
816 monotonic_to_bootbased(&ts);
765 local_irq_restore(flags); 817 local_irq_restore(flags);
766 818
767 /* With all the info we got, fill in the values */ 819 /* With all the info we got, fill in the values */
@@ -958,6 +1010,100 @@ out:
958 return r; 1010 return r;
959} 1011}
960 1012
1013static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
1014{
1015 return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
1016}
1017
1018static bool kvm_hv_msr_partition_wide(u32 msr)
1019{
1020 bool r = false;
1021 switch (msr) {
1022 case HV_X64_MSR_GUEST_OS_ID:
1023 case HV_X64_MSR_HYPERCALL:
1024 r = true;
1025 break;
1026 }
1027
1028 return r;
1029}
1030
1031static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1032{
1033 struct kvm *kvm = vcpu->kvm;
1034
1035 switch (msr) {
1036 case HV_X64_MSR_GUEST_OS_ID:
1037 kvm->arch.hv_guest_os_id = data;
1038 /* setting guest os id to zero disables hypercall page */
1039 if (!kvm->arch.hv_guest_os_id)
1040 kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1041 break;
1042 case HV_X64_MSR_HYPERCALL: {
1043 u64 gfn;
1044 unsigned long addr;
1045 u8 instructions[4];
1046
1047 /* if guest os id is not set hypercall should remain disabled */
1048 if (!kvm->arch.hv_guest_os_id)
1049 break;
1050 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1051 kvm->arch.hv_hypercall = data;
1052 break;
1053 }
1054 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1055 addr = gfn_to_hva(kvm, gfn);
1056 if (kvm_is_error_hva(addr))
1057 return 1;
1058 kvm_x86_ops->patch_hypercall(vcpu, instructions);
1059 ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1060 if (copy_to_user((void __user *)addr, instructions, 4))
1061 return 1;
1062 kvm->arch.hv_hypercall = data;
1063 break;
1064 }
1065 default:
1066 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1067 "data 0x%llx\n", msr, data);
1068 return 1;
1069 }
1070 return 0;
1071}
1072
1073static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1074{
1075 switch (msr) {
1076 case HV_X64_MSR_APIC_ASSIST_PAGE: {
1077 unsigned long addr;
1078
1079 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1080 vcpu->arch.hv_vapic = data;
1081 break;
1082 }
1083 addr = gfn_to_hva(vcpu->kvm, data >>
1084 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1085 if (kvm_is_error_hva(addr))
1086 return 1;
1087 if (clear_user((void __user *)addr, PAGE_SIZE))
1088 return 1;
1089 vcpu->arch.hv_vapic = data;
1090 break;
1091 }
1092 case HV_X64_MSR_EOI:
1093 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1094 case HV_X64_MSR_ICR:
1095 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1096 case HV_X64_MSR_TPR:
1097 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1098 default:
1099 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1100 "data 0x%llx\n", msr, data);
1101 return 1;
1102 }
1103
1104 return 0;
1105}
1106
961int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1107int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
962{ 1108{
963 switch (msr) { 1109 switch (msr) {
@@ -1072,6 +1218,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1072 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1218 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1073 "0x%x data 0x%llx\n", msr, data); 1219 "0x%x data 0x%llx\n", msr, data);
1074 break; 1220 break;
1221 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1222 if (kvm_hv_msr_partition_wide(msr)) {
1223 int r;
1224 mutex_lock(&vcpu->kvm->lock);
1225 r = set_msr_hyperv_pw(vcpu, msr, data);
1226 mutex_unlock(&vcpu->kvm->lock);
1227 return r;
1228 } else
1229 return set_msr_hyperv(vcpu, msr, data);
1230 break;
1075 default: 1231 default:
1076 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1232 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1077 return xen_hvm_config(vcpu, data); 1233 return xen_hvm_config(vcpu, data);
@@ -1171,6 +1327,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1171 return 0; 1327 return 0;
1172} 1328}
1173 1329
1330static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1331{
1332 u64 data = 0;
1333 struct kvm *kvm = vcpu->kvm;
1334
1335 switch (msr) {
1336 case HV_X64_MSR_GUEST_OS_ID:
1337 data = kvm->arch.hv_guest_os_id;
1338 break;
1339 case HV_X64_MSR_HYPERCALL:
1340 data = kvm->arch.hv_hypercall;
1341 break;
1342 default:
1343 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1344 return 1;
1345 }
1346
1347 *pdata = data;
1348 return 0;
1349}
1350
1351static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1352{
1353 u64 data = 0;
1354
1355 switch (msr) {
1356 case HV_X64_MSR_VP_INDEX: {
1357 int r;
1358 struct kvm_vcpu *v;
1359 kvm_for_each_vcpu(r, v, vcpu->kvm)
1360 if (v == vcpu)
1361 data = r;
1362 break;
1363 }
1364 case HV_X64_MSR_EOI:
1365 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
1366 case HV_X64_MSR_ICR:
1367 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
1368 case HV_X64_MSR_TPR:
1369 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
1370 default:
1371 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1372 return 1;
1373 }
1374 *pdata = data;
1375 return 0;
1376}
1377
1174int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1378int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1175{ 1379{
1176 u64 data; 1380 u64 data;
@@ -1222,7 +1426,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1222 data |= (((uint64_t)4ULL) << 40); 1426 data |= (((uint64_t)4ULL) << 40);
1223 break; 1427 break;
1224 case MSR_EFER: 1428 case MSR_EFER:
1225 data = vcpu->arch.shadow_efer; 1429 data = vcpu->arch.efer;
1226 break; 1430 break;
1227 case MSR_KVM_WALL_CLOCK: 1431 case MSR_KVM_WALL_CLOCK:
1228 data = vcpu->kvm->arch.wall_clock; 1432 data = vcpu->kvm->arch.wall_clock;
@@ -1237,6 +1441,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1237 case MSR_IA32_MCG_STATUS: 1441 case MSR_IA32_MCG_STATUS:
1238 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1442 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1239 return get_msr_mce(vcpu, msr, pdata); 1443 return get_msr_mce(vcpu, msr, pdata);
1444 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1445 if (kvm_hv_msr_partition_wide(msr)) {
1446 int r;
1447 mutex_lock(&vcpu->kvm->lock);
1448 r = get_msr_hyperv_pw(vcpu, msr, pdata);
1449 mutex_unlock(&vcpu->kvm->lock);
1450 return r;
1451 } else
1452 return get_msr_hyperv(vcpu, msr, pdata);
1453 break;
1240 default: 1454 default:
1241 if (!ignore_msrs) { 1455 if (!ignore_msrs) {
1242 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1456 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -1262,15 +1476,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1262 int (*do_msr)(struct kvm_vcpu *vcpu, 1476 int (*do_msr)(struct kvm_vcpu *vcpu,
1263 unsigned index, u64 *data)) 1477 unsigned index, u64 *data))
1264{ 1478{
1265 int i; 1479 int i, idx;
1266 1480
1267 vcpu_load(vcpu); 1481 vcpu_load(vcpu);
1268 1482
1269 down_read(&vcpu->kvm->slots_lock); 1483 idx = srcu_read_lock(&vcpu->kvm->srcu);
1270 for (i = 0; i < msrs->nmsrs; ++i) 1484 for (i = 0; i < msrs->nmsrs; ++i)
1271 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1485 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1272 break; 1486 break;
1273 up_read(&vcpu->kvm->slots_lock); 1487 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1274 1488
1275 vcpu_put(vcpu); 1489 vcpu_put(vcpu);
1276 1490
@@ -1352,6 +1566,11 @@ int kvm_dev_ioctl_check_extension(long ext)
1352 case KVM_CAP_XEN_HVM: 1566 case KVM_CAP_XEN_HVM:
1353 case KVM_CAP_ADJUST_CLOCK: 1567 case KVM_CAP_ADJUST_CLOCK:
1354 case KVM_CAP_VCPU_EVENTS: 1568 case KVM_CAP_VCPU_EVENTS:
1569 case KVM_CAP_HYPERV:
1570 case KVM_CAP_HYPERV_VAPIC:
1571 case KVM_CAP_HYPERV_SPIN:
1572 case KVM_CAP_PCI_SEGMENT:
1573 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1355 r = 1; 1574 r = 1;
1356 break; 1575 break;
1357 case KVM_CAP_COALESCED_MMIO: 1576 case KVM_CAP_COALESCED_MMIO:
@@ -1465,8 +1684,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1465 1684
1466void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1685void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1467{ 1686{
1468 kvm_x86_ops->vcpu_put(vcpu);
1469 kvm_put_guest_fpu(vcpu); 1687 kvm_put_guest_fpu(vcpu);
1688 kvm_x86_ops->vcpu_put(vcpu);
1470} 1689}
1471 1690
1472static int is_efer_nx(void) 1691static int is_efer_nx(void)
@@ -1531,6 +1750,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1531 cpuid_fix_nx_cap(vcpu); 1750 cpuid_fix_nx_cap(vcpu);
1532 r = 0; 1751 r = 0;
1533 kvm_apic_set_version(vcpu); 1752 kvm_apic_set_version(vcpu);
1753 kvm_x86_ops->cpuid_update(vcpu);
1534 1754
1535out_free: 1755out_free:
1536 vfree(cpuid_entries); 1756 vfree(cpuid_entries);
@@ -1553,6 +1773,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1553 goto out; 1773 goto out;
1554 vcpu->arch.cpuid_nent = cpuid->nent; 1774 vcpu->arch.cpuid_nent = cpuid->nent;
1555 kvm_apic_set_version(vcpu); 1775 kvm_apic_set_version(vcpu);
1776 kvm_x86_ops->cpuid_update(vcpu);
1556 return 0; 1777 return 0;
1557 1778
1558out: 1779out:
@@ -1595,12 +1816,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1595 u32 index, int *nent, int maxnent) 1816 u32 index, int *nent, int maxnent)
1596{ 1817{
1597 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1818 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1598 unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
1599#ifdef CONFIG_X86_64 1819#ifdef CONFIG_X86_64
1820 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
1821 ? F(GBPAGES) : 0;
1600 unsigned f_lm = F(LM); 1822 unsigned f_lm = F(LM);
1601#else 1823#else
1824 unsigned f_gbpages = 0;
1602 unsigned f_lm = 0; 1825 unsigned f_lm = 0;
1603#endif 1826#endif
1827 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
1604 1828
1605 /* cpuid 1.edx */ 1829 /* cpuid 1.edx */
1606 const u32 kvm_supported_word0_x86_features = 1830 const u32 kvm_supported_word0_x86_features =
@@ -1620,7 +1844,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1620 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1844 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1621 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1845 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1622 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1846 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1623 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 1847 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
1624 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1848 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1625 /* cpuid 1.ecx */ 1849 /* cpuid 1.ecx */
1626 const u32 kvm_supported_word4_x86_features = 1850 const u32 kvm_supported_word4_x86_features =
@@ -1867,7 +2091,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1867 return 0; 2091 return 0;
1868 if (mce->status & MCI_STATUS_UC) { 2092 if (mce->status & MCI_STATUS_UC) {
1869 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2093 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1870 !(vcpu->arch.cr4 & X86_CR4_MCE)) { 2094 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
1871 printk(KERN_DEBUG "kvm: set_mce: " 2095 printk(KERN_DEBUG "kvm: set_mce: "
1872 "injects mce exception while " 2096 "injects mce exception while "
1873 "previous one is in progress!\n"); 2097 "previous one is in progress!\n");
@@ -2161,14 +2385,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2161 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 2385 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
2162 return -EINVAL; 2386 return -EINVAL;
2163 2387
2164 down_write(&kvm->slots_lock); 2388 mutex_lock(&kvm->slots_lock);
2165 spin_lock(&kvm->mmu_lock); 2389 spin_lock(&kvm->mmu_lock);
2166 2390
2167 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 2391 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
2168 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 2392 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
2169 2393
2170 spin_unlock(&kvm->mmu_lock); 2394 spin_unlock(&kvm->mmu_lock);
2171 up_write(&kvm->slots_lock); 2395 mutex_unlock(&kvm->slots_lock);
2172 return 0; 2396 return 0;
2173} 2397}
2174 2398
@@ -2177,13 +2401,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2177 return kvm->arch.n_alloc_mmu_pages; 2401 return kvm->arch.n_alloc_mmu_pages;
2178} 2402}
2179 2403
2404gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2405{
2406 int i;
2407 struct kvm_mem_alias *alias;
2408 struct kvm_mem_aliases *aliases;
2409
2410 aliases = rcu_dereference(kvm->arch.aliases);
2411
2412 for (i = 0; i < aliases->naliases; ++i) {
2413 alias = &aliases->aliases[i];
2414 if (alias->flags & KVM_ALIAS_INVALID)
2415 continue;
2416 if (gfn >= alias->base_gfn
2417 && gfn < alias->base_gfn + alias->npages)
2418 return alias->target_gfn + gfn - alias->base_gfn;
2419 }
2420 return gfn;
2421}
2422
2180gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 2423gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2181{ 2424{
2182 int i; 2425 int i;
2183 struct kvm_mem_alias *alias; 2426 struct kvm_mem_alias *alias;
2427 struct kvm_mem_aliases *aliases;
2184 2428
2185 for (i = 0; i < kvm->arch.naliases; ++i) { 2429 aliases = rcu_dereference(kvm->arch.aliases);
2186 alias = &kvm->arch.aliases[i]; 2430
2431 for (i = 0; i < aliases->naliases; ++i) {
2432 alias = &aliases->aliases[i];
2187 if (gfn >= alias->base_gfn 2433 if (gfn >= alias->base_gfn
2188 && gfn < alias->base_gfn + alias->npages) 2434 && gfn < alias->base_gfn + alias->npages)
2189 return alias->target_gfn + gfn - alias->base_gfn; 2435 return alias->target_gfn + gfn - alias->base_gfn;
@@ -2201,6 +2447,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2201{ 2447{
2202 int r, n; 2448 int r, n;
2203 struct kvm_mem_alias *p; 2449 struct kvm_mem_alias *p;
2450 struct kvm_mem_aliases *aliases, *old_aliases;
2204 2451
2205 r = -EINVAL; 2452 r = -EINVAL;
2206 /* General sanity checks */ 2453 /* General sanity checks */
@@ -2217,26 +2464,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2217 < alias->target_phys_addr) 2464 < alias->target_phys_addr)
2218 goto out; 2465 goto out;
2219 2466
2220 down_write(&kvm->slots_lock); 2467 r = -ENOMEM;
2221 spin_lock(&kvm->mmu_lock); 2468 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2469 if (!aliases)
2470 goto out;
2471
2472 mutex_lock(&kvm->slots_lock);
2222 2473
2223 p = &kvm->arch.aliases[alias->slot]; 2474 /* invalidate any gfn reference in case of deletion/shrinking */
2475 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2476 aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
2477 old_aliases = kvm->arch.aliases;
2478 rcu_assign_pointer(kvm->arch.aliases, aliases);
2479 synchronize_srcu_expedited(&kvm->srcu);
2480 kvm_mmu_zap_all(kvm);
2481 kfree(old_aliases);
2482
2483 r = -ENOMEM;
2484 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2485 if (!aliases)
2486 goto out_unlock;
2487
2488 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2489
2490 p = &aliases->aliases[alias->slot];
2224 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 2491 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2225 p->npages = alias->memory_size >> PAGE_SHIFT; 2492 p->npages = alias->memory_size >> PAGE_SHIFT;
2226 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 2493 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2494 p->flags &= ~(KVM_ALIAS_INVALID);
2227 2495
2228 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 2496 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2229 if (kvm->arch.aliases[n - 1].npages) 2497 if (aliases->aliases[n - 1].npages)
2230 break; 2498 break;
2231 kvm->arch.naliases = n; 2499 aliases->naliases = n;
2232 2500
2233 spin_unlock(&kvm->mmu_lock); 2501 old_aliases = kvm->arch.aliases;
2234 kvm_mmu_zap_all(kvm); 2502 rcu_assign_pointer(kvm->arch.aliases, aliases);
2235 2503 synchronize_srcu_expedited(&kvm->srcu);
2236 up_write(&kvm->slots_lock); 2504 kfree(old_aliases);
2237 2505 r = 0;
2238 return 0;
2239 2506
2507out_unlock:
2508 mutex_unlock(&kvm->slots_lock);
2240out: 2509out:
2241 return r; 2510 return r;
2242} 2511}
@@ -2274,18 +2543,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2274 r = 0; 2543 r = 0;
2275 switch (chip->chip_id) { 2544 switch (chip->chip_id) {
2276 case KVM_IRQCHIP_PIC_MASTER: 2545 case KVM_IRQCHIP_PIC_MASTER:
2277 spin_lock(&pic_irqchip(kvm)->lock); 2546 raw_spin_lock(&pic_irqchip(kvm)->lock);
2278 memcpy(&pic_irqchip(kvm)->pics[0], 2547 memcpy(&pic_irqchip(kvm)->pics[0],
2279 &chip->chip.pic, 2548 &chip->chip.pic,
2280 sizeof(struct kvm_pic_state)); 2549 sizeof(struct kvm_pic_state));
2281 spin_unlock(&pic_irqchip(kvm)->lock); 2550 raw_spin_unlock(&pic_irqchip(kvm)->lock);
2282 break; 2551 break;
2283 case KVM_IRQCHIP_PIC_SLAVE: 2552 case KVM_IRQCHIP_PIC_SLAVE:
2284 spin_lock(&pic_irqchip(kvm)->lock); 2553 raw_spin_lock(&pic_irqchip(kvm)->lock);
2285 memcpy(&pic_irqchip(kvm)->pics[1], 2554 memcpy(&pic_irqchip(kvm)->pics[1],
2286 &chip->chip.pic, 2555 &chip->chip.pic,
2287 sizeof(struct kvm_pic_state)); 2556 sizeof(struct kvm_pic_state));
2288 spin_unlock(&pic_irqchip(kvm)->lock); 2557 raw_spin_unlock(&pic_irqchip(kvm)->lock);
2289 break; 2558 break;
2290 case KVM_IRQCHIP_IOAPIC: 2559 case KVM_IRQCHIP_IOAPIC:
2291 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 2560 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -2365,29 +2634,62 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2365int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2634int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2366 struct kvm_dirty_log *log) 2635 struct kvm_dirty_log *log)
2367{ 2636{
2368 int r; 2637 int r, n, i;
2369 int n;
2370 struct kvm_memory_slot *memslot; 2638 struct kvm_memory_slot *memslot;
2371 int is_dirty = 0; 2639 unsigned long is_dirty = 0;
2640 unsigned long *dirty_bitmap = NULL;
2372 2641
2373 down_write(&kvm->slots_lock); 2642 mutex_lock(&kvm->slots_lock);
2374 2643
2375 r = kvm_get_dirty_log(kvm, log, &is_dirty); 2644 r = -EINVAL;
2376 if (r) 2645 if (log->slot >= KVM_MEMORY_SLOTS)
2646 goto out;
2647
2648 memslot = &kvm->memslots->memslots[log->slot];
2649 r = -ENOENT;
2650 if (!memslot->dirty_bitmap)
2651 goto out;
2652
2653 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
2654
2655 r = -ENOMEM;
2656 dirty_bitmap = vmalloc(n);
2657 if (!dirty_bitmap)
2377 goto out; 2658 goto out;
2659 memset(dirty_bitmap, 0, n);
2660
2661 for (i = 0; !is_dirty && i < n/sizeof(long); i++)
2662 is_dirty = memslot->dirty_bitmap[i];
2378 2663
2379 /* If nothing is dirty, don't bother messing with page tables. */ 2664 /* If nothing is dirty, don't bother messing with page tables. */
2380 if (is_dirty) { 2665 if (is_dirty) {
2666 struct kvm_memslots *slots, *old_slots;
2667
2381 spin_lock(&kvm->mmu_lock); 2668 spin_lock(&kvm->mmu_lock);
2382 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2669 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2383 spin_unlock(&kvm->mmu_lock); 2670 spin_unlock(&kvm->mmu_lock);
2384 memslot = &kvm->memslots[log->slot]; 2671
2385 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2672 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2386 memset(memslot->dirty_bitmap, 0, n); 2673 if (!slots)
2674 goto out_free;
2675
2676 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2677 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
2678
2679 old_slots = kvm->memslots;
2680 rcu_assign_pointer(kvm->memslots, slots);
2681 synchronize_srcu_expedited(&kvm->srcu);
2682 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2683 kfree(old_slots);
2387 } 2684 }
2685
2388 r = 0; 2686 r = 0;
2687 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2688 r = -EFAULT;
2689out_free:
2690 vfree(dirty_bitmap);
2389out: 2691out:
2390 up_write(&kvm->slots_lock); 2692 mutex_unlock(&kvm->slots_lock);
2391 return r; 2693 return r;
2392} 2694}
2393 2695
@@ -2470,6 +2772,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2470 if (vpic) { 2772 if (vpic) {
2471 r = kvm_ioapic_init(kvm); 2773 r = kvm_ioapic_init(kvm);
2472 if (r) { 2774 if (r) {
2775 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
2776 &vpic->dev);
2473 kfree(vpic); 2777 kfree(vpic);
2474 goto create_irqchip_unlock; 2778 goto create_irqchip_unlock;
2475 } 2779 }
@@ -2481,10 +2785,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2481 r = kvm_setup_default_irq_routing(kvm); 2785 r = kvm_setup_default_irq_routing(kvm);
2482 if (r) { 2786 if (r) {
2483 mutex_lock(&kvm->irq_lock); 2787 mutex_lock(&kvm->irq_lock);
2484 kfree(kvm->arch.vpic); 2788 kvm_ioapic_destroy(kvm);
2485 kfree(kvm->arch.vioapic); 2789 kvm_destroy_pic(kvm);
2486 kvm->arch.vpic = NULL;
2487 kvm->arch.vioapic = NULL;
2488 mutex_unlock(&kvm->irq_lock); 2790 mutex_unlock(&kvm->irq_lock);
2489 } 2791 }
2490 create_irqchip_unlock: 2792 create_irqchip_unlock:
@@ -2500,7 +2802,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2500 sizeof(struct kvm_pit_config))) 2802 sizeof(struct kvm_pit_config)))
2501 goto out; 2803 goto out;
2502 create_pit: 2804 create_pit:
2503 down_write(&kvm->slots_lock); 2805 mutex_lock(&kvm->slots_lock);
2504 r = -EEXIST; 2806 r = -EEXIST;
2505 if (kvm->arch.vpit) 2807 if (kvm->arch.vpit)
2506 goto create_pit_unlock; 2808 goto create_pit_unlock;
@@ -2509,7 +2811,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2509 if (kvm->arch.vpit) 2811 if (kvm->arch.vpit)
2510 r = 0; 2812 r = 0;
2511 create_pit_unlock: 2813 create_pit_unlock:
2512 up_write(&kvm->slots_lock); 2814 mutex_unlock(&kvm->slots_lock);
2513 break; 2815 break;
2514 case KVM_IRQ_LINE_STATUS: 2816 case KVM_IRQ_LINE_STATUS:
2515 case KVM_IRQ_LINE: { 2817 case KVM_IRQ_LINE: {
@@ -2726,7 +3028,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2726 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 3028 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2727 return 0; 3029 return 0;
2728 3030
2729 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); 3031 return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
2730} 3032}
2731 3033
2732static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3034static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
@@ -2735,17 +3037,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2735 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 3037 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2736 return 0; 3038 return 0;
2737 3039
2738 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); 3040 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
2739} 3041}
2740 3042
2741static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3043gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
2742 struct kvm_vcpu *vcpu) 3044{
3045 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3046 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3047}
3048
3049 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3050{
3051 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3052 access |= PFERR_FETCH_MASK;
3053 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3054}
3055
3056gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3057{
3058 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3059 access |= PFERR_WRITE_MASK;
3060 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3061}
3062
3063/* uses this to access any guest's mapped memory without checking CPL */
3064gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3065{
3066 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
3067}
3068
3069static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3070 struct kvm_vcpu *vcpu, u32 access,
3071 u32 *error)
2743{ 3072{
2744 void *data = val; 3073 void *data = val;
2745 int r = X86EMUL_CONTINUE; 3074 int r = X86EMUL_CONTINUE;
2746 3075
2747 while (bytes) { 3076 while (bytes) {
2748 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3077 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
2749 unsigned offset = addr & (PAGE_SIZE-1); 3078 unsigned offset = addr & (PAGE_SIZE-1);
2750 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3079 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2751 int ret; 3080 int ret;
@@ -2768,14 +3097,37 @@ out:
2768 return r; 3097 return r;
2769} 3098}
2770 3099
3100/* used for instruction fetching */
3101static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
3102 struct kvm_vcpu *vcpu, u32 *error)
3103{
3104 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3105 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3106 access | PFERR_FETCH_MASK, error);
3107}
3108
3109static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
3110 struct kvm_vcpu *vcpu, u32 *error)
3111{
3112 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3113 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3114 error);
3115}
3116
3117static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3118 struct kvm_vcpu *vcpu, u32 *error)
3119{
3120 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
3121}
3122
2771static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 3123static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2772 struct kvm_vcpu *vcpu) 3124 struct kvm_vcpu *vcpu, u32 *error)
2773{ 3125{
2774 void *data = val; 3126 void *data = val;
2775 int r = X86EMUL_CONTINUE; 3127 int r = X86EMUL_CONTINUE;
2776 3128
2777 while (bytes) { 3129 while (bytes) {
2778 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3130 gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
2779 unsigned offset = addr & (PAGE_SIZE-1); 3131 unsigned offset = addr & (PAGE_SIZE-1);
2780 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3132 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2781 int ret; 3133 int ret;
@@ -2805,6 +3157,7 @@ static int emulator_read_emulated(unsigned long addr,
2805 struct kvm_vcpu *vcpu) 3157 struct kvm_vcpu *vcpu)
2806{ 3158{
2807 gpa_t gpa; 3159 gpa_t gpa;
3160 u32 error_code;
2808 3161
2809 if (vcpu->mmio_read_completed) { 3162 if (vcpu->mmio_read_completed) {
2810 memcpy(val, vcpu->mmio_data, bytes); 3163 memcpy(val, vcpu->mmio_data, bytes);
@@ -2814,17 +3167,20 @@ static int emulator_read_emulated(unsigned long addr,
2814 return X86EMUL_CONTINUE; 3167 return X86EMUL_CONTINUE;
2815 } 3168 }
2816 3169
2817 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3170 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
3171
3172 if (gpa == UNMAPPED_GVA) {
3173 kvm_inject_page_fault(vcpu, addr, error_code);
3174 return X86EMUL_PROPAGATE_FAULT;
3175 }
2818 3176
2819 /* For APIC access vmexit */ 3177 /* For APIC access vmexit */
2820 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3178 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2821 goto mmio; 3179 goto mmio;
2822 3180
2823 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 3181 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
2824 == X86EMUL_CONTINUE) 3182 == X86EMUL_CONTINUE)
2825 return X86EMUL_CONTINUE; 3183 return X86EMUL_CONTINUE;
2826 if (gpa == UNMAPPED_GVA)
2827 return X86EMUL_PROPAGATE_FAULT;
2828 3184
2829mmio: 3185mmio:
2830 /* 3186 /*
@@ -2863,11 +3219,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
2863 struct kvm_vcpu *vcpu) 3219 struct kvm_vcpu *vcpu)
2864{ 3220{
2865 gpa_t gpa; 3221 gpa_t gpa;
3222 u32 error_code;
2866 3223
2867 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3224 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
2868 3225
2869 if (gpa == UNMAPPED_GVA) { 3226 if (gpa == UNMAPPED_GVA) {
2870 kvm_inject_page_fault(vcpu, addr, 2); 3227 kvm_inject_page_fault(vcpu, addr, error_code);
2871 return X86EMUL_PROPAGATE_FAULT; 3228 return X86EMUL_PROPAGATE_FAULT;
2872 } 3229 }
2873 3230
@@ -2931,7 +3288,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
2931 char *kaddr; 3288 char *kaddr;
2932 u64 val; 3289 u64 val;
2933 3290
2934 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3291 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
2935 3292
2936 if (gpa == UNMAPPED_GVA || 3293 if (gpa == UNMAPPED_GVA ||
2937 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3294 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -2968,35 +3325,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2968 3325
2969int emulate_clts(struct kvm_vcpu *vcpu) 3326int emulate_clts(struct kvm_vcpu *vcpu)
2970{ 3327{
2971 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 3328 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3329 kvm_x86_ops->fpu_activate(vcpu);
2972 return X86EMUL_CONTINUE; 3330 return X86EMUL_CONTINUE;
2973} 3331}
2974 3332
2975int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3333int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2976{ 3334{
2977 struct kvm_vcpu *vcpu = ctxt->vcpu; 3335 return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
2978
2979 switch (dr) {
2980 case 0 ... 3:
2981 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2982 return X86EMUL_CONTINUE;
2983 default:
2984 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2985 return X86EMUL_UNHANDLEABLE;
2986 }
2987} 3336}
2988 3337
2989int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3338int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2990{ 3339{
2991 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3340 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2992 int exception;
2993 3341
2994 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 3342 return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
2995 if (exception) {
2996 /* FIXME: better handling */
2997 return X86EMUL_UNHANDLEABLE;
2998 }
2999 return X86EMUL_CONTINUE;
3000} 3343}
3001 3344
3002void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3345void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3010,7 +3353,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3010 3353
3011 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 3354 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
3012 3355
3013 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); 3356 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
3014 3357
3015 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 3358 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
3016 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 3359 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -3018,7 +3361,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3018EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3361EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3019 3362
3020static struct x86_emulate_ops emulate_ops = { 3363static struct x86_emulate_ops emulate_ops = {
3021 .read_std = kvm_read_guest_virt, 3364 .read_std = kvm_read_guest_virt_system,
3365 .fetch = kvm_fetch_guest_virt,
3022 .read_emulated = emulator_read_emulated, 3366 .read_emulated = emulator_read_emulated,
3023 .write_emulated = emulator_write_emulated, 3367 .write_emulated = emulator_write_emulated,
3024 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3368 .cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -3061,8 +3405,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3061 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3405 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3062 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 3406 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
3063 vcpu->arch.emulate_ctxt.mode = 3407 vcpu->arch.emulate_ctxt.mode =
3408 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3064 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3409 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
3065 ? X86EMUL_MODE_REAL : cs_l 3410 ? X86EMUL_MODE_VM86 : cs_l
3066 ? X86EMUL_MODE_PROT64 : cs_db 3411 ? X86EMUL_MODE_PROT64 : cs_db
3067 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3412 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3068 3413
@@ -3154,12 +3499,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
3154 gva_t q = vcpu->arch.pio.guest_gva; 3499 gva_t q = vcpu->arch.pio.guest_gva;
3155 unsigned bytes; 3500 unsigned bytes;
3156 int ret; 3501 int ret;
3502 u32 error_code;
3157 3503
3158 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 3504 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
3159 if (vcpu->arch.pio.in) 3505 if (vcpu->arch.pio.in)
3160 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 3506 ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
3161 else 3507 else
3162 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 3508 ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
3509
3510 if (ret == X86EMUL_PROPAGATE_FAULT)
3511 kvm_inject_page_fault(vcpu, q, error_code);
3512
3163 return ret; 3513 return ret;
3164} 3514}
3165 3515
@@ -3180,7 +3530,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
3180 if (io->in) { 3530 if (io->in) {
3181 r = pio_copy_data(vcpu); 3531 r = pio_copy_data(vcpu);
3182 if (r) 3532 if (r)
3183 return r; 3533 goto out;
3184 } 3534 }
3185 3535
3186 delta = 1; 3536 delta = 1;
@@ -3207,7 +3557,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
3207 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 3557 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
3208 } 3558 }
3209 } 3559 }
3210 3560out:
3211 io->count -= io->cur_count; 3561 io->count -= io->cur_count;
3212 io->cur_count = 0; 3562 io->cur_count = 0;
3213 3563
@@ -3220,11 +3570,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3220 int r; 3570 int r;
3221 3571
3222 if (vcpu->arch.pio.in) 3572 if (vcpu->arch.pio.in)
3223 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3573 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3224 vcpu->arch.pio.size, pd); 3574 vcpu->arch.pio.size, pd);
3225 else 3575 else
3226 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3576 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3227 vcpu->arch.pio.size, pd); 3577 vcpu->arch.pio.port, vcpu->arch.pio.size,
3578 pd);
3228 return r; 3579 return r;
3229} 3580}
3230 3581
@@ -3235,7 +3586,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
3235 int i, r = 0; 3586 int i, r = 0;
3236 3587
3237 for (i = 0; i < io->cur_count; i++) { 3588 for (i = 0; i < io->cur_count; i++) {
3238 if (kvm_io_bus_write(&vcpu->kvm->pio_bus, 3589 if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3239 io->port, io->size, pd)) { 3590 io->port, io->size, pd)) {
3240 r = -EOPNOTSUPP; 3591 r = -EOPNOTSUPP;
3241 break; 3592 break;
@@ -3249,6 +3600,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3249{ 3600{
3250 unsigned long val; 3601 unsigned long val;
3251 3602
3603 trace_kvm_pio(!in, port, size, 1);
3604
3252 vcpu->run->exit_reason = KVM_EXIT_IO; 3605 vcpu->run->exit_reason = KVM_EXIT_IO;
3253 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3606 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3254 vcpu->run->io.size = vcpu->arch.pio.size = size; 3607 vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3260,11 +3613,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3260 vcpu->arch.pio.down = 0; 3613 vcpu->arch.pio.down = 0;
3261 vcpu->arch.pio.rep = 0; 3614 vcpu->arch.pio.rep = 0;
3262 3615
3263 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3616 if (!vcpu->arch.pio.in) {
3264 size, 1); 3617 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3265 3618 memcpy(vcpu->arch.pio_data, &val, 4);
3266 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 3619 }
3267 memcpy(vcpu->arch.pio_data, &val, 4);
3268 3620
3269 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3621 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3270 complete_pio(vcpu); 3622 complete_pio(vcpu);
@@ -3281,6 +3633,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3281 unsigned now, in_page; 3633 unsigned now, in_page;
3282 int ret = 0; 3634 int ret = 0;
3283 3635
3636 trace_kvm_pio(!in, port, size, count);
3637
3284 vcpu->run->exit_reason = KVM_EXIT_IO; 3638 vcpu->run->exit_reason = KVM_EXIT_IO;
3285 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3639 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3286 vcpu->run->io.size = vcpu->arch.pio.size = size; 3640 vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3292,9 +3646,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3292 vcpu->arch.pio.down = down; 3646 vcpu->arch.pio.down = down;
3293 vcpu->arch.pio.rep = rep; 3647 vcpu->arch.pio.rep = rep;
3294 3648
3295 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
3296 size, count);
3297
3298 if (!count) { 3649 if (!count) {
3299 kvm_x86_ops->skip_emulated_instruction(vcpu); 3650 kvm_x86_ops->skip_emulated_instruction(vcpu);
3300 return 1; 3651 return 1;
@@ -3326,10 +3677,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3326 if (!vcpu->arch.pio.in) { 3677 if (!vcpu->arch.pio.in) {
3327 /* string PIO write */ 3678 /* string PIO write */
3328 ret = pio_copy_data(vcpu); 3679 ret = pio_copy_data(vcpu);
3329 if (ret == X86EMUL_PROPAGATE_FAULT) { 3680 if (ret == X86EMUL_PROPAGATE_FAULT)
3330 kvm_inject_gp(vcpu, 0);
3331 return 1; 3681 return 1;
3332 }
3333 if (ret == 0 && !pio_string_write(vcpu)) { 3682 if (ret == 0 && !pio_string_write(vcpu)) {
3334 complete_pio(vcpu); 3683 complete_pio(vcpu);
3335 if (vcpu->arch.pio.count == 0) 3684 if (vcpu->arch.pio.count == 0)
@@ -3488,11 +3837,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
3488 return a0 | ((gpa_t)a1 << 32); 3837 return a0 | ((gpa_t)a1 << 32);
3489} 3838}
3490 3839
3840int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
3841{
3842 u64 param, ingpa, outgpa, ret;
3843 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
3844 bool fast, longmode;
3845 int cs_db, cs_l;
3846
3847 /*
3848 * hypercall generates UD from non zero cpl and real mode
3849 * per HYPER-V spec
3850 */
3851 if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
3852 kvm_queue_exception(vcpu, UD_VECTOR);
3853 return 0;
3854 }
3855
3856 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3857 longmode = is_long_mode(vcpu) && cs_l == 1;
3858
3859 if (!longmode) {
3860 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
3861 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
3862 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
3863 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
3864 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
3865 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
3866 }
3867#ifdef CONFIG_X86_64
3868 else {
3869 param = kvm_register_read(vcpu, VCPU_REGS_RCX);
3870 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
3871 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
3872 }
3873#endif
3874
3875 code = param & 0xffff;
3876 fast = (param >> 16) & 0x1;
3877 rep_cnt = (param >> 32) & 0xfff;
3878 rep_idx = (param >> 48) & 0xfff;
3879
3880 trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
3881
3882 switch (code) {
3883 case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
3884 kvm_vcpu_on_spin(vcpu);
3885 break;
3886 default:
3887 res = HV_STATUS_INVALID_HYPERCALL_CODE;
3888 break;
3889 }
3890
3891 ret = res | (((u64)rep_done & 0xfff) << 32);
3892 if (longmode) {
3893 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3894 } else {
3895 kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
3896 kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
3897 }
3898
3899 return 1;
3900}
3901
3491int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 3902int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
3492{ 3903{
3493 unsigned long nr, a0, a1, a2, a3, ret; 3904 unsigned long nr, a0, a1, a2, a3, ret;
3494 int r = 1; 3905 int r = 1;
3495 3906
3907 if (kvm_hv_hypercall_enabled(vcpu->kvm))
3908 return kvm_hv_hypercall(vcpu);
3909
3496 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 3910 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
3497 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 3911 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
3498 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 3912 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -3535,10 +3949,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
3535int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 3949int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3536{ 3950{
3537 char instruction[3]; 3951 char instruction[3];
3538 int ret = 0;
3539 unsigned long rip = kvm_rip_read(vcpu); 3952 unsigned long rip = kvm_rip_read(vcpu);
3540 3953
3541
3542 /* 3954 /*
3543 * Blow out the MMU to ensure that no other VCPU has an active mapping 3955 * Blow out the MMU to ensure that no other VCPU has an active mapping
3544 * to ensure that the updated hypercall appears atomically across all 3956 * to ensure that the updated hypercall appears atomically across all
@@ -3547,11 +3959,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3547 kvm_mmu_zap_all(vcpu->kvm); 3959 kvm_mmu_zap_all(vcpu->kvm);
3548 3960
3549 kvm_x86_ops->patch_hypercall(vcpu, instruction); 3961 kvm_x86_ops->patch_hypercall(vcpu, instruction);
3550 if (emulator_write_emulated(rip, instruction, 3, vcpu)
3551 != X86EMUL_CONTINUE)
3552 ret = -EFAULT;
3553 3962
3554 return ret; 3963 return emulator_write_emulated(rip, instruction, 3, vcpu);
3555} 3964}
3556 3965
3557static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3966static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -3584,10 +3993,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3584{ 3993{
3585 unsigned long value; 3994 unsigned long value;
3586 3995
3587 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3588 switch (cr) { 3996 switch (cr) {
3589 case 0: 3997 case 0:
3590 value = vcpu->arch.cr0; 3998 value = kvm_read_cr0(vcpu);
3591 break; 3999 break;
3592 case 2: 4000 case 2:
3593 value = vcpu->arch.cr2; 4001 value = vcpu->arch.cr2;
@@ -3596,7 +4004,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3596 value = vcpu->arch.cr3; 4004 value = vcpu->arch.cr3;
3597 break; 4005 break;
3598 case 4: 4006 case 4:
3599 value = vcpu->arch.cr4; 4007 value = kvm_read_cr4(vcpu);
3600 break; 4008 break;
3601 case 8: 4009 case 8:
3602 value = kvm_get_cr8(vcpu); 4010 value = kvm_get_cr8(vcpu);
@@ -3614,7 +4022,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3614{ 4022{
3615 switch (cr) { 4023 switch (cr) {
3616 case 0: 4024 case 0:
3617 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 4025 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3618 *rflags = kvm_get_rflags(vcpu); 4026 *rflags = kvm_get_rflags(vcpu);
3619 break; 4027 break;
3620 case 2: 4028 case 2:
@@ -3624,7 +4032,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3624 kvm_set_cr3(vcpu, val); 4032 kvm_set_cr3(vcpu, val);
3625 break; 4033 break;
3626 case 4: 4034 case 4:
3627 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 4035 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3628 break; 4036 break;
3629 case 8: 4037 case 8:
3630 kvm_set_cr8(vcpu, val & 0xfUL); 4038 kvm_set_cr8(vcpu, val & 0xfUL);
@@ -3691,6 +4099,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3691 } 4099 }
3692 return best; 4100 return best;
3693} 4101}
4102EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
3694 4103
3695int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 4104int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3696{ 4105{
@@ -3774,14 +4183,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
3774static void vapic_exit(struct kvm_vcpu *vcpu) 4183static void vapic_exit(struct kvm_vcpu *vcpu)
3775{ 4184{
3776 struct kvm_lapic *apic = vcpu->arch.apic; 4185 struct kvm_lapic *apic = vcpu->arch.apic;
4186 int idx;
3777 4187
3778 if (!apic || !apic->vapic_addr) 4188 if (!apic || !apic->vapic_addr)
3779 return; 4189 return;
3780 4190
3781 down_read(&vcpu->kvm->slots_lock); 4191 idx = srcu_read_lock(&vcpu->kvm->srcu);
3782 kvm_release_page_dirty(apic->vapic_page); 4192 kvm_release_page_dirty(apic->vapic_page);
3783 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 4193 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3784 up_read(&vcpu->kvm->slots_lock); 4194 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3785} 4195}
3786 4196
3787static void update_cr8_intercept(struct kvm_vcpu *vcpu) 4197static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -3877,12 +4287,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3877 r = 0; 4287 r = 0;
3878 goto out; 4288 goto out;
3879 } 4289 }
4290 if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
4291 vcpu->fpu_active = 0;
4292 kvm_x86_ops->fpu_deactivate(vcpu);
4293 }
3880 } 4294 }
3881 4295
3882 preempt_disable(); 4296 preempt_disable();
3883 4297
3884 kvm_x86_ops->prepare_guest_switch(vcpu); 4298 kvm_x86_ops->prepare_guest_switch(vcpu);
3885 kvm_load_guest_fpu(vcpu); 4299 if (vcpu->fpu_active)
4300 kvm_load_guest_fpu(vcpu);
3886 4301
3887 local_irq_disable(); 4302 local_irq_disable();
3888 4303
@@ -3910,7 +4325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3910 kvm_lapic_sync_to_vapic(vcpu); 4325 kvm_lapic_sync_to_vapic(vcpu);
3911 } 4326 }
3912 4327
3913 up_read(&vcpu->kvm->slots_lock); 4328 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
3914 4329
3915 kvm_guest_enter(); 4330 kvm_guest_enter();
3916 4331
@@ -3952,7 +4367,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3952 4367
3953 preempt_enable(); 4368 preempt_enable();
3954 4369
3955 down_read(&vcpu->kvm->slots_lock); 4370 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3956 4371
3957 /* 4372 /*
3958 * Profile KVM exit RIPs: 4373 * Profile KVM exit RIPs:
@@ -3974,6 +4389,7 @@ out:
3974static int __vcpu_run(struct kvm_vcpu *vcpu) 4389static int __vcpu_run(struct kvm_vcpu *vcpu)
3975{ 4390{
3976 int r; 4391 int r;
4392 struct kvm *kvm = vcpu->kvm;
3977 4393
3978 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 4394 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3979 pr_debug("vcpu %d received sipi with vector # %x\n", 4395 pr_debug("vcpu %d received sipi with vector # %x\n",
@@ -3985,7 +4401,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
3985 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4401 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3986 } 4402 }
3987 4403
3988 down_read(&vcpu->kvm->slots_lock); 4404 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3989 vapic_enter(vcpu); 4405 vapic_enter(vcpu);
3990 4406
3991 r = 1; 4407 r = 1;
@@ -3993,9 +4409,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
3993 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 4409 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3994 r = vcpu_enter_guest(vcpu); 4410 r = vcpu_enter_guest(vcpu);
3995 else { 4411 else {
3996 up_read(&vcpu->kvm->slots_lock); 4412 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
3997 kvm_vcpu_block(vcpu); 4413 kvm_vcpu_block(vcpu);
3998 down_read(&vcpu->kvm->slots_lock); 4414 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3999 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4415 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
4000 { 4416 {
4001 switch(vcpu->arch.mp_state) { 4417 switch(vcpu->arch.mp_state) {
@@ -4030,13 +4446,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4030 ++vcpu->stat.signal_exits; 4446 ++vcpu->stat.signal_exits;
4031 } 4447 }
4032 if (need_resched()) { 4448 if (need_resched()) {
4033 up_read(&vcpu->kvm->slots_lock); 4449 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4034 kvm_resched(vcpu); 4450 kvm_resched(vcpu);
4035 down_read(&vcpu->kvm->slots_lock); 4451 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
4036 } 4452 }
4037 } 4453 }
4038 4454
4039 up_read(&vcpu->kvm->slots_lock); 4455 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4040 post_kvm_run_save(vcpu); 4456 post_kvm_run_save(vcpu);
4041 4457
4042 vapic_exit(vcpu); 4458 vapic_exit(vcpu);
@@ -4075,10 +4491,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4075 vcpu->mmio_read_completed = 1; 4491 vcpu->mmio_read_completed = 1;
4076 vcpu->mmio_needed = 0; 4492 vcpu->mmio_needed = 0;
4077 4493
4078 down_read(&vcpu->kvm->slots_lock); 4494 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4079 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, 4495 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
4080 EMULTYPE_NO_DECODE); 4496 EMULTYPE_NO_DECODE);
4081 up_read(&vcpu->kvm->slots_lock); 4497 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4082 if (r == EMULATE_DO_MMIO) { 4498 if (r == EMULATE_DO_MMIO) {
4083 /* 4499 /*
4084 * Read-modify-write. Back to userspace. 4500 * Read-modify-write. Back to userspace.
@@ -4205,13 +4621,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4205 sregs->gdt.limit = dt.limit; 4621 sregs->gdt.limit = dt.limit;
4206 sregs->gdt.base = dt.base; 4622 sregs->gdt.base = dt.base;
4207 4623
4208 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4624 sregs->cr0 = kvm_read_cr0(vcpu);
4209 sregs->cr0 = vcpu->arch.cr0;
4210 sregs->cr2 = vcpu->arch.cr2; 4625 sregs->cr2 = vcpu->arch.cr2;
4211 sregs->cr3 = vcpu->arch.cr3; 4626 sregs->cr3 = vcpu->arch.cr3;
4212 sregs->cr4 = vcpu->arch.cr4; 4627 sregs->cr4 = kvm_read_cr4(vcpu);
4213 sregs->cr8 = kvm_get_cr8(vcpu); 4628 sregs->cr8 = kvm_get_cr8(vcpu);
4214 sregs->efer = vcpu->arch.shadow_efer; 4629 sregs->efer = vcpu->arch.efer;
4215 sregs->apic_base = kvm_get_apic_base(vcpu); 4630 sregs->apic_base = kvm_get_apic_base(vcpu);
4216 4631
4217 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 4632 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
@@ -4299,14 +4714,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4299{ 4714{
4300 struct descriptor_table dtable; 4715 struct descriptor_table dtable;
4301 u16 index = selector >> 3; 4716 u16 index = selector >> 3;
4717 int ret;
4718 u32 err;
4719 gva_t addr;
4302 4720
4303 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4721 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4304 4722
4305 if (dtable.limit < index * 8 + 7) { 4723 if (dtable.limit < index * 8 + 7) {
4306 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4724 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4307 return 1; 4725 return X86EMUL_PROPAGATE_FAULT;
4308 } 4726 }
4309 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4727 addr = dtable.base + index * 8;
4728 ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
4729 vcpu, &err);
4730 if (ret == X86EMUL_PROPAGATE_FAULT)
4731 kvm_inject_page_fault(vcpu, addr, err);
4732
4733 return ret;
4310} 4734}
4311 4735
4312/* allowed just for 8 bytes segments */ 4736/* allowed just for 8 bytes segments */
@@ -4320,15 +4744,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4320 4744
4321 if (dtable.limit < index * 8 + 7) 4745 if (dtable.limit < index * 8 + 7)
4322 return 1; 4746 return 1;
4323 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4747 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
4748}
4749
4750static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
4751 struct desc_struct *seg_desc)
4752{
4753 u32 base_addr = get_desc_base(seg_desc);
4754
4755 return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
4324} 4756}
4325 4757
4326static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, 4758static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
4327 struct desc_struct *seg_desc) 4759 struct desc_struct *seg_desc)
4328{ 4760{
4329 u32 base_addr = get_desc_base(seg_desc); 4761 u32 base_addr = get_desc_base(seg_desc);
4330 4762
4331 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4763 return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
4332} 4764}
4333 4765
4334static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 4766static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -4339,18 +4771,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4339 return kvm_seg.selector; 4771 return kvm_seg.selector;
4340} 4772}
4341 4773
4342static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
4343 u16 selector,
4344 struct kvm_segment *kvm_seg)
4345{
4346 struct desc_struct seg_desc;
4347
4348 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
4349 return 1;
4350 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
4351 return 0;
4352}
4353
4354static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 4774static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4355{ 4775{
4356 struct kvm_segment segvar = { 4776 struct kvm_segment segvar = {
@@ -4368,7 +4788,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
4368 .unusable = 0, 4788 .unusable = 0,
4369 }; 4789 };
4370 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 4790 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4371 return 0; 4791 return X86EMUL_CONTINUE;
4372} 4792}
4373 4793
4374static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) 4794static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
@@ -4378,24 +4798,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4378 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); 4798 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4379} 4799}
4380 4800
4381int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4801int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
4382 int type_bits, int seg)
4383{ 4802{
4384 struct kvm_segment kvm_seg; 4803 struct kvm_segment kvm_seg;
4804 struct desc_struct seg_desc;
4805 u8 dpl, rpl, cpl;
4806 unsigned err_vec = GP_VECTOR;
4807 u32 err_code = 0;
4808 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
4809 int ret;
4385 4810
4386 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) 4811 if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
4387 return kvm_load_realmode_segment(vcpu, selector, seg); 4812 return kvm_load_realmode_segment(vcpu, selector, seg);
4388 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
4389 return 1;
4390 kvm_seg.type |= type_bits;
4391 4813
4392 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 4814 /* NULL selector is not valid for TR, CS and SS */
4393 seg != VCPU_SREG_LDTR) 4815 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
4394 if (!kvm_seg.s) 4816 && null_selector)
4395 kvm_seg.unusable = 1; 4817 goto exception;
4818
4819 /* TR should be in GDT only */
4820 if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
4821 goto exception;
4822
4823 ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
4824 if (ret)
4825 return ret;
4826
4827 seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
4828
4829 if (null_selector) { /* for NULL selector skip all following checks */
4830 kvm_seg.unusable = 1;
4831 goto load;
4832 }
4833
4834 err_code = selector & 0xfffc;
4835 err_vec = GP_VECTOR;
4396 4836
4837 /* can't load system descriptor into segment selecor */
4838 if (seg <= VCPU_SREG_GS && !kvm_seg.s)
4839 goto exception;
4840
4841 if (!kvm_seg.present) {
4842 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
4843 goto exception;
4844 }
4845
4846 rpl = selector & 3;
4847 dpl = kvm_seg.dpl;
4848 cpl = kvm_x86_ops->get_cpl(vcpu);
4849
4850 switch (seg) {
4851 case VCPU_SREG_SS:
4852 /*
4853 * segment is not a writable data segment or segment
4854 * selector's RPL != CPL or segment selector's RPL != CPL
4855 */
4856 if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
4857 goto exception;
4858 break;
4859 case VCPU_SREG_CS:
4860 if (!(kvm_seg.type & 8))
4861 goto exception;
4862
4863 if (kvm_seg.type & 4) {
4864 /* conforming */
4865 if (dpl > cpl)
4866 goto exception;
4867 } else {
4868 /* nonconforming */
4869 if (rpl > cpl || dpl != cpl)
4870 goto exception;
4871 }
4872 /* CS(RPL) <- CPL */
4873 selector = (selector & 0xfffc) | cpl;
4874 break;
4875 case VCPU_SREG_TR:
4876 if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
4877 goto exception;
4878 break;
4879 case VCPU_SREG_LDTR:
4880 if (kvm_seg.s || kvm_seg.type != 2)
4881 goto exception;
4882 break;
4883 default: /* DS, ES, FS, or GS */
4884 /*
4885 * segment is not a data or readable code segment or
4886 * ((segment is a data or nonconforming code segment)
4887 * and (both RPL and CPL > DPL))
4888 */
4889 if ((kvm_seg.type & 0xa) == 0x8 ||
4890 (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
4891 goto exception;
4892 break;
4893 }
4894
4895 if (!kvm_seg.unusable && kvm_seg.s) {
4896 /* mark segment as accessed */
4897 kvm_seg.type |= 1;
4898 seg_desc.type |= 1;
4899 save_guest_segment_descriptor(vcpu, selector, &seg_desc);
4900 }
4901load:
4397 kvm_set_segment(vcpu, &kvm_seg, seg); 4902 kvm_set_segment(vcpu, &kvm_seg, seg);
4398 return 0; 4903 return X86EMUL_CONTINUE;
4904exception:
4905 kvm_queue_exception_e(vcpu, err_vec, err_code);
4906 return X86EMUL_PROPAGATE_FAULT;
4399} 4907}
4400 4908
4401static void save_state_to_tss32(struct kvm_vcpu *vcpu, 4909static void save_state_to_tss32(struct kvm_vcpu *vcpu,
@@ -4421,6 +4929,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4421 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4929 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4422} 4930}
4423 4931
4932static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
4933{
4934 struct kvm_segment kvm_seg;
4935 kvm_get_segment(vcpu, &kvm_seg, seg);
4936 kvm_seg.selector = sel;
4937 kvm_set_segment(vcpu, &kvm_seg, seg);
4938}
4939
4424static int load_state_from_tss32(struct kvm_vcpu *vcpu, 4940static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4425 struct tss_segment_32 *tss) 4941 struct tss_segment_32 *tss)
4426{ 4942{
@@ -4438,25 +4954,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4438 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 4954 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4439 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 4955 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4440 4956
4441 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 4957 /*
4958 * SDM says that segment selectors are loaded before segment
4959 * descriptors
4960 */
4961 kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
4962 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
4963 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
4964 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
4965 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
4966 kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
4967 kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
4968
4969 /*
4970 * Now load segment descriptors. If fault happenes at this stage
4971 * it is handled in a context of new task
4972 */
4973 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
4442 return 1; 4974 return 1;
4443 4975
4444 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4976 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
4445 return 1; 4977 return 1;
4446 4978
4447 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4979 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
4448 return 1; 4980 return 1;
4449 4981
4450 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4982 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
4451 return 1; 4983 return 1;
4452 4984
4453 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4985 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
4454 return 1; 4986 return 1;
4455 4987
4456 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 4988 if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
4457 return 1; 4989 return 1;
4458 4990
4459 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 4991 if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
4460 return 1; 4992 return 1;
4461 return 0; 4993 return 0;
4462} 4994}
@@ -4496,19 +5028,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4496 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 5028 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
4497 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 5029 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
4498 5030
4499 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 5031 /*
5032 * SDM says that segment selectors are loaded before segment
5033 * descriptors
5034 */
5035 kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
5036 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5037 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5038 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5039 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5040
5041 /*
5042 * Now load segment descriptors. If fault happenes at this stage
5043 * it is handled in a context of new task
5044 */
5045 if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
4500 return 1; 5046 return 1;
4501 5047
4502 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 5048 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
4503 return 1; 5049 return 1;
4504 5050
4505 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 5051 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
4506 return 1; 5052 return 1;
4507 5053
4508 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 5054 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
4509 return 1; 5055 return 1;
4510 5056
4511 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 5057 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
4512 return 1; 5058 return 1;
4513 return 0; 5059 return 0;
4514} 5060}
@@ -4530,7 +5076,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4530 sizeof tss_segment_16)) 5076 sizeof tss_segment_16))
4531 goto out; 5077 goto out;
4532 5078
4533 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 5079 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
4534 &tss_segment_16, sizeof tss_segment_16)) 5080 &tss_segment_16, sizeof tss_segment_16))
4535 goto out; 5081 goto out;
4536 5082
@@ -4538,7 +5084,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4538 tss_segment_16.prev_task_link = old_tss_sel; 5084 tss_segment_16.prev_task_link = old_tss_sel;
4539 5085
4540 if (kvm_write_guest(vcpu->kvm, 5086 if (kvm_write_guest(vcpu->kvm,
4541 get_tss_base_addr(vcpu, nseg_desc), 5087 get_tss_base_addr_write(vcpu, nseg_desc),
4542 &tss_segment_16.prev_task_link, 5088 &tss_segment_16.prev_task_link,
4543 sizeof tss_segment_16.prev_task_link)) 5089 sizeof tss_segment_16.prev_task_link))
4544 goto out; 5090 goto out;
@@ -4569,7 +5115,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4569 sizeof tss_segment_32)) 5115 sizeof tss_segment_32))
4570 goto out; 5116 goto out;
4571 5117
4572 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 5118 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
4573 &tss_segment_32, sizeof tss_segment_32)) 5119 &tss_segment_32, sizeof tss_segment_32))
4574 goto out; 5120 goto out;
4575 5121
@@ -4577,7 +5123,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4577 tss_segment_32.prev_task_link = old_tss_sel; 5123 tss_segment_32.prev_task_link = old_tss_sel;
4578 5124
4579 if (kvm_write_guest(vcpu->kvm, 5125 if (kvm_write_guest(vcpu->kvm,
4580 get_tss_base_addr(vcpu, nseg_desc), 5126 get_tss_base_addr_write(vcpu, nseg_desc),
4581 &tss_segment_32.prev_task_link, 5127 &tss_segment_32.prev_task_link,
4582 sizeof tss_segment_32.prev_task_link)) 5128 sizeof tss_segment_32.prev_task_link))
4583 goto out; 5129 goto out;
@@ -4600,7 +5146,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4600 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 5146 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
4601 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 5147 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
4602 5148
4603 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 5149 old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
4604 5150
4605 /* FIXME: Handle errors. Failure to read either TSS or their 5151 /* FIXME: Handle errors. Failure to read either TSS or their
4606 * descriptors should generate a pagefault. 5152 * descriptors should generate a pagefault.
@@ -4659,7 +5205,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4659 &nseg_desc); 5205 &nseg_desc);
4660 } 5206 }
4661 5207
4662 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 5208 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
4663 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 5209 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4664 tr_seg.type = 11; 5210 tr_seg.type = 11;
4665 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 5211 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
@@ -4690,17 +5236,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4690 5236
4691 kvm_set_cr8(vcpu, sregs->cr8); 5237 kvm_set_cr8(vcpu, sregs->cr8);
4692 5238
4693 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 5239 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
4694 kvm_x86_ops->set_efer(vcpu, sregs->efer); 5240 kvm_x86_ops->set_efer(vcpu, sregs->efer);
4695 kvm_set_apic_base(vcpu, sregs->apic_base); 5241 kvm_set_apic_base(vcpu, sregs->apic_base);
4696 5242
4697 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 5243 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
4698
4699 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4700 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 5244 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4701 vcpu->arch.cr0 = sregs->cr0; 5245 vcpu->arch.cr0 = sregs->cr0;
4702 5246
4703 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 5247 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
4704 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5248 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4705 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5249 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4706 load_pdptrs(vcpu, vcpu->arch.cr3); 5250 load_pdptrs(vcpu, vcpu->arch.cr3);
@@ -4735,7 +5279,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4735 /* Older userspace won't unhalt the vcpu on reset. */ 5279 /* Older userspace won't unhalt the vcpu on reset. */
4736 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 5280 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4737 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 5281 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4738 !(vcpu->arch.cr0 & X86_CR0_PE)) 5282 !is_protmode(vcpu))
4739 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5283 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4740 5284
4741 vcpu_put(vcpu); 5285 vcpu_put(vcpu);
@@ -4833,11 +5377,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4833{ 5377{
4834 unsigned long vaddr = tr->linear_address; 5378 unsigned long vaddr = tr->linear_address;
4835 gpa_t gpa; 5379 gpa_t gpa;
5380 int idx;
4836 5381
4837 vcpu_load(vcpu); 5382 vcpu_load(vcpu);
4838 down_read(&vcpu->kvm->slots_lock); 5383 idx = srcu_read_lock(&vcpu->kvm->srcu);
4839 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 5384 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
4840 up_read(&vcpu->kvm->slots_lock); 5385 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4841 tr->physical_address = gpa; 5386 tr->physical_address = gpa;
4842 tr->valid = gpa != UNMAPPED_GVA; 5387 tr->valid = gpa != UNMAPPED_GVA;
4843 tr->writeable = 1; 5388 tr->writeable = 1;
@@ -4918,14 +5463,14 @@ EXPORT_SYMBOL_GPL(fx_init);
4918 5463
4919void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5464void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4920{ 5465{
4921 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 5466 if (vcpu->guest_fpu_loaded)
4922 return; 5467 return;
4923 5468
4924 vcpu->guest_fpu_loaded = 1; 5469 vcpu->guest_fpu_loaded = 1;
4925 kvm_fx_save(&vcpu->arch.host_fx_image); 5470 kvm_fx_save(&vcpu->arch.host_fx_image);
4926 kvm_fx_restore(&vcpu->arch.guest_fx_image); 5471 kvm_fx_restore(&vcpu->arch.guest_fx_image);
5472 trace_kvm_fpu(1);
4927} 5473}
4928EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4929 5474
4930void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5475void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4931{ 5476{
@@ -4936,8 +5481,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4936 kvm_fx_save(&vcpu->arch.guest_fx_image); 5481 kvm_fx_save(&vcpu->arch.guest_fx_image);
4937 kvm_fx_restore(&vcpu->arch.host_fx_image); 5482 kvm_fx_restore(&vcpu->arch.host_fx_image);
4938 ++vcpu->stat.fpu_reload; 5483 ++vcpu->stat.fpu_reload;
5484 set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
5485 trace_kvm_fpu(0);
4939} 5486}
4940EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4941 5487
4942void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5488void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4943{ 5489{
@@ -5089,11 +5635,13 @@ fail:
5089 5635
5090void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 5636void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
5091{ 5637{
5638 int idx;
5639
5092 kfree(vcpu->arch.mce_banks); 5640 kfree(vcpu->arch.mce_banks);
5093 kvm_free_lapic(vcpu); 5641 kvm_free_lapic(vcpu);
5094 down_read(&vcpu->kvm->slots_lock); 5642 idx = srcu_read_lock(&vcpu->kvm->srcu);
5095 kvm_mmu_destroy(vcpu); 5643 kvm_mmu_destroy(vcpu);
5096 up_read(&vcpu->kvm->slots_lock); 5644 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5097 free_page((unsigned long)vcpu->arch.pio_data); 5645 free_page((unsigned long)vcpu->arch.pio_data);
5098} 5646}
5099 5647
@@ -5104,6 +5652,12 @@ struct kvm *kvm_arch_create_vm(void)
5104 if (!kvm) 5652 if (!kvm)
5105 return ERR_PTR(-ENOMEM); 5653 return ERR_PTR(-ENOMEM);
5106 5654
5655 kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
5656 if (!kvm->arch.aliases) {
5657 kfree(kvm);
5658 return ERR_PTR(-ENOMEM);
5659 }
5660
5107 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5661 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5108 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5662 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5109 5663
@@ -5160,16 +5714,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5160 put_page(kvm->arch.apic_access_page); 5714 put_page(kvm->arch.apic_access_page);
5161 if (kvm->arch.ept_identity_pagetable) 5715 if (kvm->arch.ept_identity_pagetable)
5162 put_page(kvm->arch.ept_identity_pagetable); 5716 put_page(kvm->arch.ept_identity_pagetable);
5717 cleanup_srcu_struct(&kvm->srcu);
5718 kfree(kvm->arch.aliases);
5163 kfree(kvm); 5719 kfree(kvm);
5164} 5720}
5165 5721
5166int kvm_arch_set_memory_region(struct kvm *kvm, 5722int kvm_arch_prepare_memory_region(struct kvm *kvm,
5167 struct kvm_userspace_memory_region *mem, 5723 struct kvm_memory_slot *memslot,
5168 struct kvm_memory_slot old, 5724 struct kvm_memory_slot old,
5725 struct kvm_userspace_memory_region *mem,
5169 int user_alloc) 5726 int user_alloc)
5170{ 5727{
5171 int npages = mem->memory_size >> PAGE_SHIFT; 5728 int npages = memslot->npages;
5172 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
5173 5729
5174 /*To keep backward compatibility with older userspace, 5730 /*To keep backward compatibility with older userspace,
5175 *x86 needs to hanlde !user_alloc case. 5731 *x86 needs to hanlde !user_alloc case.
@@ -5189,26 +5745,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
5189 if (IS_ERR((void *)userspace_addr)) 5745 if (IS_ERR((void *)userspace_addr))
5190 return PTR_ERR((void *)userspace_addr); 5746 return PTR_ERR((void *)userspace_addr);
5191 5747
5192 /* set userspace_addr atomically for kvm_hva_to_rmapp */
5193 spin_lock(&kvm->mmu_lock);
5194 memslot->userspace_addr = userspace_addr; 5748 memslot->userspace_addr = userspace_addr;
5195 spin_unlock(&kvm->mmu_lock);
5196 } else {
5197 if (!old.user_alloc && old.rmap) {
5198 int ret;
5199
5200 down_write(&current->mm->mmap_sem);
5201 ret = do_munmap(current->mm, old.userspace_addr,
5202 old.npages * PAGE_SIZE);
5203 up_write(&current->mm->mmap_sem);
5204 if (ret < 0)
5205 printk(KERN_WARNING
5206 "kvm_vm_ioctl_set_memory_region: "
5207 "failed to munmap memory\n");
5208 }
5209 } 5749 }
5210 } 5750 }
5211 5751
5752
5753 return 0;
5754}
5755
5756void kvm_arch_commit_memory_region(struct kvm *kvm,
5757 struct kvm_userspace_memory_region *mem,
5758 struct kvm_memory_slot old,
5759 int user_alloc)
5760{
5761
5762 int npages = mem->memory_size >> PAGE_SHIFT;
5763
5764 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
5765 int ret;
5766
5767 down_write(&current->mm->mmap_sem);
5768 ret = do_munmap(current->mm, old.userspace_addr,
5769 old.npages * PAGE_SIZE);
5770 up_write(&current->mm->mmap_sem);
5771 if (ret < 0)
5772 printk(KERN_WARNING
5773 "kvm_vm_ioctl_set_memory_region: "
5774 "failed to munmap memory\n");
5775 }
5776
5212 spin_lock(&kvm->mmu_lock); 5777 spin_lock(&kvm->mmu_lock);
5213 if (!kvm->arch.n_requested_mmu_pages) { 5778 if (!kvm->arch.n_requested_mmu_pages) {
5214 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 5779 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
@@ -5217,8 +5782,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
5217 5782
5218 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 5783 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
5219 spin_unlock(&kvm->mmu_lock); 5784 spin_unlock(&kvm->mmu_lock);
5220
5221 return 0;
5222} 5785}
5223 5786
5224void kvm_arch_flush_shadow(struct kvm *kvm) 5787void kvm_arch_flush_shadow(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea585d2a..2d101639bd8d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,6 +2,7 @@
2#define ARCH_X86_KVM_X86_H 2#define ARCH_X86_KVM_X86_H
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5#include "kvm_cache_regs.h"
5 6
6static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) 7static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
7{ 8{
@@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
35struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 36struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
36 u32 function, u32 index); 37 u32 function, u32 index);
37 38
39static inline bool is_protmode(struct kvm_vcpu *vcpu)
40{
41 return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
42}
43
44static inline int is_long_mode(struct kvm_vcpu *vcpu)
45{
46#ifdef CONFIG_X86_64
47 return vcpu->arch.efer & EFER_LMA;
48#else
49 return 0;
50#endif
51}
52
53static inline int is_pae(struct kvm_vcpu *vcpu)
54{
55 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
56}
57
58static inline int is_pse(struct kvm_vcpu *vcpu)
59{
60 return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
61}
62
63static inline int is_paging(struct kvm_vcpu *vcpu)
64{
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66}
67
38#endif 68#endif
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index cffd754f3039..419386c24b82 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -14,7 +14,7 @@ $(obj)/inat.o: $(obj)/inat-tables.c
14 14
15clean-files := inat-tables.c 15clean-files := inat-tables.c
16 16
17obj-$(CONFIG_SMP) += msr-smp.o 17obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
18 18
19lib-y := delay.o 19lib-y := delay.o
20lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
@@ -34,9 +34,10 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y)
34endif 34endif
35 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 35 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
36else 36else
37 obj-y += io_64.o iomap_copy_64.o 37 obj-y += iomap_copy_64.o
38 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o 38 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
39 lib-y += thunk_64.o clear_page_64.o copy_page_64.o 39 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
40 lib-y += memmove_64.o memset_64.o 40 lib-y += memmove_64.o memset_64.o
41 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o 41 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
42 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
42endif 43endif
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
new file mode 100644
index 000000000000..a3c668875038
--- /dev/null
+++ b/arch/x86/lib/cache-smp.c
@@ -0,0 +1,19 @@
1#include <linux/smp.h>
2#include <linux/module.h>
3
4static void __wbinvd(void *dummy)
5{
6 wbinvd();
7}
8
9void wbinvd_on_cpu(int cpu)
10{
11 smp_call_function_single(cpu, __wbinvd, NULL, 1);
12}
13EXPORT_SYMBOL(wbinvd_on_cpu);
14
15int wbinvd_on_all_cpus(void)
16{
17 return on_each_cpu(__wbinvd, NULL, 1);
18}
19EXPORT_SYMBOL(wbinvd_on_all_cpus);
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index cf889d4e076a..71100c98e337 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -90,12 +90,6 @@ ENTRY(_copy_from_user)
90 CFI_ENDPROC 90 CFI_ENDPROC
91ENDPROC(_copy_from_user) 91ENDPROC(_copy_from_user)
92 92
93ENTRY(copy_user_generic)
94 CFI_STARTPROC
95 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
96 CFI_ENDPROC
97ENDPROC(copy_user_generic)
98
99 .section .fixup,"ax" 93 .section .fixup,"ax"
100 /* must zero dest */ 94 /* must zero dest */
101ENTRY(bad_from_user) 95ENTRY(bad_from_user)
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c
deleted file mode 100644
index 3f1eb59b5f08..000000000000
--- a/arch/x86/lib/io_64.c
+++ /dev/null
@@ -1,25 +0,0 @@
1#include <linux/string.h>
2#include <linux/module.h>
3#include <asm/io.h>
4
5void __memcpy_toio(unsigned long dst, const void *src, unsigned len)
6{
7 __inline_memcpy((void *)dst, src, len);
8}
9EXPORT_SYMBOL(__memcpy_toio);
10
11void __memcpy_fromio(void *dst, unsigned long src, unsigned len)
12{
13 __inline_memcpy(dst, (const void *)src, len);
14}
15EXPORT_SYMBOL(__memcpy_fromio);
16
17void memset_io(volatile void __iomem *a, int b, size_t c)
18{
19 /*
20 * TODO: memset can mangle the IO patterns quite a bit.
21 * perhaps it would be better to use a dumb one:
22 */
23 memset((void *)a, b, c);
24}
25EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index ad5441ed1b57..f82e884928af 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -20,12 +20,11 @@
20/* 20/*
21 * memcpy_c() - fast string ops (REP MOVSQ) based variant. 21 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
22 * 22 *
23 * Calls to this get patched into the kernel image via the 23 * This gets patched over the unrolled variant (below) via the
24 * alternative instructions framework: 24 * alternative instructions framework:
25 */ 25 */
26 ALIGN 26 .section .altinstr_replacement, "ax", @progbits
27memcpy_c: 27.Lmemcpy_c:
28 CFI_STARTPROC
29 movq %rdi, %rax 28 movq %rdi, %rax
30 29
31 movl %edx, %ecx 30 movl %edx, %ecx
@@ -35,8 +34,8 @@ memcpy_c:
35 movl %edx, %ecx 34 movl %edx, %ecx
36 rep movsb 35 rep movsb
37 ret 36 ret
38 CFI_ENDPROC 37.Lmemcpy_e:
39ENDPROC(memcpy_c) 38 .previous
40 39
41ENTRY(__memcpy) 40ENTRY(__memcpy)
42ENTRY(memcpy) 41ENTRY(memcpy)
@@ -128,16 +127,10 @@ ENDPROC(__memcpy)
128 * It is also a lot simpler. Use this when possible: 127 * It is also a lot simpler. Use this when possible:
129 */ 128 */
130 129
131 .section .altinstr_replacement, "ax"
1321: .byte 0xeb /* jmp <disp8> */
133 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
1342:
135 .previous
136
137 .section .altinstructions, "a" 130 .section .altinstructions, "a"
138 .align 8 131 .align 8
139 .quad memcpy 132 .quad memcpy
140 .quad 1b 133 .quad .Lmemcpy_c
141 .byte X86_FEATURE_REP_GOOD 134 .byte X86_FEATURE_REP_GOOD
142 135
143 /* 136 /*
@@ -145,6 +138,6 @@ ENDPROC(__memcpy)
145 * so it is silly to overwrite itself with nops - reboot is the 138 * so it is silly to overwrite itself with nops - reboot is the
146 * only outcome... 139 * only outcome...
147 */ 140 */
148 .byte 2b - 1b 141 .byte .Lmemcpy_e - .Lmemcpy_c
149 .byte 2b - 1b 142 .byte .Lmemcpy_e - .Lmemcpy_c
150 .previous 143 .previous
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 2c5948116bd2..e88d3b81644a 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -12,9 +12,8 @@
12 * 12 *
13 * rax original destination 13 * rax original destination
14 */ 14 */
15 ALIGN 15 .section .altinstr_replacement, "ax", @progbits
16memset_c: 16.Lmemset_c:
17 CFI_STARTPROC
18 movq %rdi,%r9 17 movq %rdi,%r9
19 movl %edx,%r8d 18 movl %edx,%r8d
20 andl $7,%r8d 19 andl $7,%r8d
@@ -29,8 +28,8 @@ memset_c:
29 rep stosb 28 rep stosb
30 movq %r9,%rax 29 movq %r9,%rax
31 ret 30 ret
32 CFI_ENDPROC 31.Lmemset_e:
33ENDPROC(memset_c) 32 .previous
34 33
35ENTRY(memset) 34ENTRY(memset)
36ENTRY(__memset) 35ENTRY(__memset)
@@ -118,16 +117,11 @@ ENDPROC(__memset)
118 117
119#include <asm/cpufeature.h> 118#include <asm/cpufeature.h>
120 119
121 .section .altinstr_replacement,"ax"
1221: .byte 0xeb /* jmp <disp8> */
123 .byte (memset_c - memset) - (2f - 1b) /* offset */
1242:
125 .previous
126 .section .altinstructions,"a" 120 .section .altinstructions,"a"
127 .align 8 121 .align 8
128 .quad memset 122 .quad memset
129 .quad 1b 123 .quad .Lmemset_c
130 .byte X86_FEATURE_REP_GOOD 124 .byte X86_FEATURE_REP_GOOD
131 .byte .Lfinal - memset 125 .byte .Lfinal - memset
132 .byte 2b - 1b 126 .byte .Lmemset_e - .Lmemset_c
133 .previous 127 .previous
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
new file mode 100644
index 000000000000..15acecf0d7aa
--- /dev/null
+++ b/arch/x86/lib/rwsem_64.S
@@ -0,0 +1,81 @@
1/*
2 * x86-64 rwsem wrappers
3 *
4 * This interfaces the inline asm code to the slow-path
5 * C routines. We need to save the call-clobbered regs
6 * that the asm does not mark as clobbered, and move the
7 * argument from %rax to %rdi.
8 *
9 * NOTE! We don't need to save %rax, because the functions
10 * will always return the semaphore pointer in %rax (which
11 * is also the input argument to these helpers)
12 *
13 * The following can clobber %rdx because the asm clobbers it:
14 * call_rwsem_down_write_failed
15 * call_rwsem_wake
16 * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
17 */
18
19#include <linux/linkage.h>
20#include <asm/rwlock.h>
21#include <asm/alternative-asm.h>
22#include <asm/frame.h>
23#include <asm/dwarf2.h>
24
25#define save_common_regs \
26 pushq %rdi; \
27 pushq %rsi; \
28 pushq %rcx; \
29 pushq %r8; \
30 pushq %r9; \
31 pushq %r10; \
32 pushq %r11
33
34#define restore_common_regs \
35 popq %r11; \
36 popq %r10; \
37 popq %r9; \
38 popq %r8; \
39 popq %rcx; \
40 popq %rsi; \
41 popq %rdi
42
43/* Fix up special calling conventions */
44ENTRY(call_rwsem_down_read_failed)
45 save_common_regs
46 pushq %rdx
47 movq %rax,%rdi
48 call rwsem_down_read_failed
49 popq %rdx
50 restore_common_regs
51 ret
52 ENDPROC(call_rwsem_down_read_failed)
53
54ENTRY(call_rwsem_down_write_failed)
55 save_common_regs
56 movq %rax,%rdi
57 call rwsem_down_write_failed
58 restore_common_regs
59 ret
60 ENDPROC(call_rwsem_down_write_failed)
61
62ENTRY(call_rwsem_wake)
63 decw %dx /* do nothing if still outstanding active readers */
64 jnz 1f
65 save_common_regs
66 movq %rax,%rdi
67 call rwsem_wake
68 restore_common_regs
691: ret
70 ENDPROC(call_rwsem_wake)
71
72/* Fix up special calling conventions */
73ENTRY(call_rwsem_downgrade_wake)
74 save_common_regs
75 pushq %rdx
76 movq %rax,%rdi
77 call rwsem_downgrade_wake
78 popq %rdx
79 restore_common_regs
80 ret
81 ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d406c5239019..e71c5cbc8f35 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -266,16 +266,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
266 if (!after_bootmem) 266 if (!after_bootmem)
267 find_early_table_space(end, use_pse, use_gbpages); 267 find_early_table_space(end, use_pse, use_gbpages);
268 268
269#ifdef CONFIG_X86_32
270 for (i = 0; i < nr_range; i++)
271 kernel_physical_mapping_init(mr[i].start, mr[i].end,
272 mr[i].page_size_mask);
273 ret = end;
274#else /* CONFIG_X86_64 */
275 for (i = 0; i < nr_range; i++) 269 for (i = 0; i < nr_range; i++)
276 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, 270 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
277 mr[i].page_size_mask); 271 mr[i].page_size_mask);
278#endif
279 272
280#ifdef CONFIG_X86_32 273#ifdef CONFIG_X86_32
281 early_ioremap_page_table_range_init(); 274 early_ioremap_page_table_range_init();
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9a0c258a86be..5cb3f0f54f47 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -241,6 +241,7 @@ kernel_physical_mapping_init(unsigned long start,
241 unsigned long page_size_mask) 241 unsigned long page_size_mask)
242{ 242{
243 int use_pse = page_size_mask == (1<<PG_LEVEL_2M); 243 int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
244 unsigned long last_map_addr = end;
244 unsigned long start_pfn, end_pfn; 245 unsigned long start_pfn, end_pfn;
245 pgd_t *pgd_base = swapper_pg_dir; 246 pgd_t *pgd_base = swapper_pg_dir;
246 int pgd_idx, pmd_idx, pte_ofs; 247 int pgd_idx, pmd_idx, pte_ofs;
@@ -341,9 +342,10 @@ repeat:
341 prot = PAGE_KERNEL_EXEC; 342 prot = PAGE_KERNEL_EXEC;
342 343
343 pages_4k++; 344 pages_4k++;
344 if (mapping_iter == 1) 345 if (mapping_iter == 1) {
345 set_pte(pte, pfn_pte(pfn, init_prot)); 346 set_pte(pte, pfn_pte(pfn, init_prot));
346 else 347 last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
348 } else
347 set_pte(pte, pfn_pte(pfn, prot)); 349 set_pte(pte, pfn_pte(pfn, prot));
348 } 350 }
349 } 351 }
@@ -368,7 +370,7 @@ repeat:
368 mapping_iter = 2; 370 mapping_iter = 2;
369 goto repeat; 371 goto repeat;
370 } 372 }
371 return 0; 373 return last_map_addr;
372} 374}
373 375
374pte_t *kmap_pte; 376pte_t *kmap_pte;
@@ -748,6 +750,7 @@ static void __init zone_sizes_init(void)
748 free_area_init_nodes(max_zone_pfns); 750 free_area_init_nodes(max_zone_pfns);
749} 751}
750 752
753#ifndef CONFIG_NO_BOOTMEM
751static unsigned long __init setup_node_bootmem(int nodeid, 754static unsigned long __init setup_node_bootmem(int nodeid,
752 unsigned long start_pfn, 755 unsigned long start_pfn,
753 unsigned long end_pfn, 756 unsigned long end_pfn,
@@ -764,13 +767,14 @@ static unsigned long __init setup_node_bootmem(int nodeid,
764 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n", 767 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
765 nodeid, bootmap, bootmap + bootmap_size); 768 nodeid, bootmap, bootmap + bootmap_size);
766 free_bootmem_with_active_regions(nodeid, end_pfn); 769 free_bootmem_with_active_regions(nodeid, end_pfn);
767 early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
768 770
769 return bootmap + bootmap_size; 771 return bootmap + bootmap_size;
770} 772}
773#endif
771 774
772void __init setup_bootmem_allocator(void) 775void __init setup_bootmem_allocator(void)
773{ 776{
777#ifndef CONFIG_NO_BOOTMEM
774 int nodeid; 778 int nodeid;
775 unsigned long bootmap_size, bootmap; 779 unsigned long bootmap_size, bootmap;
776 /* 780 /*
@@ -782,11 +786,13 @@ void __init setup_bootmem_allocator(void)
782 if (bootmap == -1L) 786 if (bootmap == -1L)
783 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 787 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
784 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); 788 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
789#endif
785 790
786 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 791 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
787 max_pfn_mapped<<PAGE_SHIFT); 792 max_pfn_mapped<<PAGE_SHIFT);
788 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); 793 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
789 794
795#ifndef CONFIG_NO_BOOTMEM
790 for_each_online_node(nodeid) { 796 for_each_online_node(nodeid) {
791 unsigned long start_pfn, end_pfn; 797 unsigned long start_pfn, end_pfn;
792 798
@@ -804,6 +810,7 @@ void __init setup_bootmem_allocator(void)
804 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, 810 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
805 bootmap); 811 bootmap);
806 } 812 }
813#endif
807 814
808 after_bootmem = 1; 815 after_bootmem = 1;
809} 816}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5198b9bb34ef..e9b040e1cde5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -49,6 +49,7 @@
49#include <asm/numa.h> 49#include <asm/numa.h>
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h> 51#include <asm/init.h>
52#include <linux/bootmem.h>
52 53
53static unsigned long dma_reserve __initdata; 54static unsigned long dma_reserve __initdata;
54 55
@@ -571,6 +572,7 @@ kernel_physical_mapping_init(unsigned long start,
571void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 572void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
572 int acpi, int k8) 573 int acpi, int k8)
573{ 574{
575#ifndef CONFIG_NO_BOOTMEM
574 unsigned long bootmap_size, bootmap; 576 unsigned long bootmap_size, bootmap;
575 577
576 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; 578 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
@@ -578,13 +580,15 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
578 PAGE_SIZE); 580 PAGE_SIZE);
579 if (bootmap == -1L) 581 if (bootmap == -1L)
580 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 582 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
583 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
581 /* don't touch min_low_pfn */ 584 /* don't touch min_low_pfn */
582 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, 585 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
583 0, end_pfn); 586 0, end_pfn);
584 e820_register_active_regions(0, start_pfn, end_pfn); 587 e820_register_active_regions(0, start_pfn, end_pfn);
585 free_bootmem_with_active_regions(0, end_pfn); 588 free_bootmem_with_active_regions(0, end_pfn);
586 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); 589#else
587 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); 590 e820_register_active_regions(0, start_pfn, end_pfn);
591#endif
588} 592}
589#endif 593#endif
590 594
@@ -616,6 +620,21 @@ void __init paging_init(void)
616 */ 620 */
617#ifdef CONFIG_MEMORY_HOTPLUG 621#ifdef CONFIG_MEMORY_HOTPLUG
618/* 622/*
623 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
624 * updating.
625 */
626static void update_end_of_memory_vars(u64 start, u64 size)
627{
628 unsigned long end_pfn = PFN_UP(start + size);
629
630 if (end_pfn > max_pfn) {
631 max_pfn = end_pfn;
632 max_low_pfn = end_pfn;
633 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
634 }
635}
636
637/*
619 * Memory is added always to NORMAL zone. This means you will never get 638 * Memory is added always to NORMAL zone. This means you will never get
620 * additional DMA/DMA32 memory. 639 * additional DMA/DMA32 memory.
621 */ 640 */
@@ -634,6 +653,9 @@ int arch_add_memory(int nid, u64 start, u64 size)
634 ret = __add_pages(nid, zone, start_pfn, nr_pages); 653 ret = __add_pages(nid, zone, start_pfn, nr_pages);
635 WARN_ON_ONCE(ret); 654 WARN_ON_ONCE(ret);
636 655
656 /* update max_pfn, max_low_pfn and high_memory */
657 update_end_of_memory_vars(start, size);
658
637 return ret; 659 return ret;
638} 660}
639EXPORT_SYMBOL_GPL(arch_add_memory); 661EXPORT_SYMBOL_GPL(arch_add_memory);
@@ -955,7 +977,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
955 if (pmd_none(*pmd)) { 977 if (pmd_none(*pmd)) {
956 pte_t entry; 978 pte_t entry;
957 979
958 p = vmemmap_alloc_block(PMD_SIZE, node); 980 p = vmemmap_alloc_block_buf(PMD_SIZE, node);
959 if (!p) 981 if (!p)
960 return -ENOMEM; 982 return -ENOMEM;
961 983
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c246d259822d..5eb1ba74a3a9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,43 +24,6 @@
24 24
25#include "physaddr.h" 25#include "physaddr.h"
26 26
27int page_is_ram(unsigned long pagenr)
28{
29 resource_size_t addr, end;
30 int i;
31
32 /*
33 * A special case is the first 4Kb of memory;
34 * This is a BIOS owned area, not kernel ram, but generally
35 * not listed as such in the E820 table.
36 */
37 if (pagenr == 0)
38 return 0;
39
40 /*
41 * Second special case: Some BIOSen report the PC BIOS
42 * area (640->1Mb) as ram even though it is not.
43 */
44 if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
45 pagenr < (BIOS_END >> PAGE_SHIFT))
46 return 0;
47
48 for (i = 0; i < e820.nr_map; i++) {
49 /*
50 * Not usable memory:
51 */
52 if (e820.map[i].type != E820_RAM)
53 continue;
54 addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
55 end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
56
57
58 if ((pagenr >= addr) && (pagenr < end))
59 return 1;
60 }
61 return 0;
62}
63
64/* 27/*
65 * Fix up the linear direct mapping of the kernel to avoid cache attribute 28 * Fix up the linear direct mapping of the kernel to avoid cache attribute
66 * conflicts. 29 * conflicts.
@@ -422,6 +385,10 @@ void __init early_ioremap_init(void)
422 * The boot-ioremap range spans multiple pmds, for which 385 * The boot-ioremap range spans multiple pmds, for which
423 * we are not prepared: 386 * we are not prepared:
424 */ 387 */
388#define __FIXADDR_TOP (-PAGE_SIZE)
389 BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
390 != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
391#undef __FIXADDR_TOP
425 if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { 392 if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
426 WARN_ON(1); 393 WARN_ON(1);
427 printk(KERN_WARNING "pmd %p != %p\n", 394 printk(KERN_WARNING "pmd %p != %p\n",
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index 8cc183344140..b3b531a4f8e5 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -337,7 +337,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
337 if (!shadow) 337 if (!shadow)
338 return true; 338 return true;
339 339
340 status = kmemcheck_shadow_test(shadow, size); 340 status = kmemcheck_shadow_test_all(shadow, size);
341 341
342 return status == KMEMCHECK_SHADOW_INITIALIZED; 342 return status == KMEMCHECK_SHADOW_INITIALIZED;
343} 343}
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
index 3f66b82076a3..aec124214d97 100644
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -125,12 +125,12 @@ void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
125 125
126enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) 126enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
127{ 127{
128#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
128 uint8_t *x; 129 uint8_t *x;
129 unsigned int i; 130 unsigned int i;
130 131
131 x = shadow; 132 x = shadow;
132 133
133#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
134 /* 134 /*
135 * Make sure _some_ bytes are initialized. Gcc frequently generates 135 * Make sure _some_ bytes are initialized. Gcc frequently generates
136 * code to access neighboring bytes. 136 * code to access neighboring bytes.
@@ -139,13 +139,25 @@ enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
139 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) 139 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
140 return x[i]; 140 return x[i];
141 } 141 }
142
143 return x[0];
142#else 144#else
145 return kmemcheck_shadow_test_all(shadow, size);
146#endif
147}
148
149enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
150{
151 uint8_t *x;
152 unsigned int i;
153
154 x = shadow;
155
143 /* All bytes must be initialized. */ 156 /* All bytes must be initialized. */
144 for (i = 0; i < size; ++i) { 157 for (i = 0; i < size; ++i) {
145 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) 158 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
146 return x[i]; 159 return x[i];
147 } 160 }
148#endif
149 161
150 return x[0]; 162 return x[0];
151} 163}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index af46d9ab9d86..ff0b2f70fbcb 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -11,6 +11,8 @@ enum kmemcheck_shadow {
11void *kmemcheck_shadow_lookup(unsigned long address); 11void *kmemcheck_shadow_lookup(unsigned long address);
12 12
13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); 13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
14enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
15 unsigned int size);
14void kmemcheck_shadow_set(void *shadow, unsigned int size); 16void kmemcheck_shadow_set(void *shadow, unsigned int size);
15 17
16#endif 18#endif
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c8191defc38a..1dab5194fd9d 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -71,7 +71,7 @@ static int mmap_is_legacy(void)
71 if (current->personality & ADDR_COMPAT_LAYOUT) 71 if (current->personality & ADDR_COMPAT_LAYOUT)
72 return 1; 72 return 1;
73 73
74 if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) 74 if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
75 return 1; 75 return 1;
76 76
77 return sysctl_legacy_va_layout; 77 return sysctl_legacy_va_layout;
@@ -96,7 +96,7 @@ static unsigned long mmap_rnd(void)
96 96
97static unsigned long mmap_base(void) 97static unsigned long mmap_base(void)
98{ 98{
99 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; 99 unsigned long gap = rlimit(RLIMIT_STACK);
100 100
101 if (gap < MIN_GAP) 101 if (gap < MIN_GAP)
102 gap = MIN_GAP; 102 gap = MIN_GAP;
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index b20760ca7244..809baaaf48b1 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -418,7 +418,10 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
418 418
419 for_each_online_node(nid) { 419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->node_id = nid;
422#ifndef CONFIG_NO_BOOTMEM
421 NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; 423 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
424#endif
422 } 425 }
423 426
424 setup_bootmem_allocator(); 427 setup_bootmem_allocator();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 83bbc70d11bb..8948f47fde05 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -163,30 +163,48 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
163 unsigned long end, unsigned long size, 163 unsigned long end, unsigned long size,
164 unsigned long align) 164 unsigned long align)
165{ 165{
166 unsigned long mem = find_e820_area(start, end, size, align); 166 unsigned long mem;
167 void *ptr;
168 167
168 /*
169 * put it on high as possible
170 * something will go with NODE_DATA
171 */
172 if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
173 start = MAX_DMA_PFN<<PAGE_SHIFT;
174 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
175 end > (MAX_DMA32_PFN<<PAGE_SHIFT))
176 start = MAX_DMA32_PFN<<PAGE_SHIFT;
177 mem = find_e820_area(start, end, size, align);
178 if (mem != -1L)
179 return __va(mem);
180
181 /* extend the search scope */
182 end = max_pfn_mapped << PAGE_SHIFT;
183 if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
184 start = MAX_DMA32_PFN<<PAGE_SHIFT;
185 else
186 start = MAX_DMA_PFN<<PAGE_SHIFT;
187 mem = find_e820_area(start, end, size, align);
169 if (mem != -1L) 188 if (mem != -1L)
170 return __va(mem); 189 return __va(mem);
171 190
172 ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); 191 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
173 if (ptr == NULL) {
174 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
175 size, nodeid); 192 size, nodeid);
176 return NULL; 193
177 } 194 return NULL;
178 return ptr;
179} 195}
180 196
181/* Initialize bootmem allocator for a node */ 197/* Initialize bootmem allocator for a node */
182void __init 198void __init
183setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 199setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
184{ 200{
185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 201 unsigned long start_pfn, last_pfn, nodedata_phys;
186 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 202 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
187 unsigned long bootmap_start, nodedata_phys;
188 void *bootmap;
189 int nid; 203 int nid;
204#ifndef CONFIG_NO_BOOTMEM
205 unsigned long bootmap_start, bootmap_pages, bootmap_size;
206 void *bootmap;
207#endif
190 208
191 if (!end) 209 if (!end)
192 return; 210 return;
@@ -200,7 +218,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
200 218
201 start = roundup(start, ZONE_ALIGN); 219 start = roundup(start, ZONE_ALIGN);
202 220
203 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 221 printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
204 start, end); 222 start, end);
205 223
206 start_pfn = start >> PAGE_SHIFT; 224 start_pfn = start >> PAGE_SHIFT;
@@ -211,14 +229,21 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
211 if (node_data[nodeid] == NULL) 229 if (node_data[nodeid] == NULL)
212 return; 230 return;
213 nodedata_phys = __pa(node_data[nodeid]); 231 nodedata_phys = __pa(node_data[nodeid]);
232 reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
214 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, 233 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
215 nodedata_phys + pgdat_size - 1); 234 nodedata_phys + pgdat_size - 1);
235 nid = phys_to_nid(nodedata_phys);
236 if (nid != nodeid)
237 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
216 238
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 239 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; 240 NODE_DATA(nodeid)->node_id = nodeid;
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 241 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; 242 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 243
244#ifndef CONFIG_NO_BOOTMEM
245 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
246
222 /* 247 /*
223 * Find a place for the bootmem map 248 * Find a place for the bootmem map
224 * nodedata_phys could be on other nodes by alloc_bootmem, 249 * nodedata_phys could be on other nodes by alloc_bootmem,
@@ -227,11 +252,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
227 * of alloc_bootmem, that could clash with reserved range 252 * of alloc_bootmem, that could clash with reserved range
228 */ 253 */
229 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); 254 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 255 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
231 if (nid == nodeid)
232 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else
234 bootmap_start = roundup(start, PAGE_SIZE);
235 /* 256 /*
236 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like 257 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 258 * to use that to align to PAGE_SIZE
@@ -239,18 +260,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 260 bootmap = early_node_mem(nodeid, bootmap_start, end,
240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 261 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
241 if (bootmap == NULL) { 262 if (bootmap == NULL) {
242 if (nodedata_phys < start || nodedata_phys >= end) { 263 free_early(nodedata_phys, nodedata_phys + pgdat_size);
243 /*
244 * only need to free it if it is from other node
245 * bootmem
246 */
247 if (nid != nodeid)
248 free_bootmem(nodedata_phys, pgdat_size);
249 }
250 node_data[nodeid] = NULL; 264 node_data[nodeid] = NULL;
251 return; 265 return;
252 } 266 }
253 bootmap_start = __pa(bootmap); 267 bootmap_start = __pa(bootmap);
268 reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
269 "BOOTMAP");
254 270
255 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 271 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
256 bootmap_start >> PAGE_SHIFT, 272 bootmap_start >> PAGE_SHIFT,
@@ -259,31 +275,12 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
259 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 275 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
260 bootmap_start, bootmap_start + bootmap_size - 1, 276 bootmap_start, bootmap_start + bootmap_size - 1,
261 bootmap_pages); 277 bootmap_pages);
262
263 free_bootmem_with_active_regions(nodeid, end);
264
265 /*
266 * convert early reserve to bootmem reserve earlier
267 * otherwise early_node_mem could use early reserved mem
268 * on previous node
269 */
270 early_res_to_bootmem(start, end);
271
272 /*
273 * in some case early_node_mem could use alloc_bootmem
274 * to get range on other node, don't reserve that again
275 */
276 if (nid != nodeid)
277 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
278 else
279 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
280 pgdat_size, BOOTMEM_DEFAULT);
281 nid = phys_to_nid(bootmap_start); 278 nid = phys_to_nid(bootmap_start);
282 if (nid != nodeid) 279 if (nid != nodeid)
283 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); 280 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
284 else 281
285 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, 282 free_bootmem_with_active_regions(nodeid, end);
286 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); 283#endif
287 284
288 node_set_online(nodeid); 285 node_set_online(nodeid);
289} 286}
@@ -427,7 +424,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
427 * Calculate the number of big nodes that can be allocated as a result 424 * Calculate the number of big nodes that can be allocated as a result
428 * of consolidating the remainder. 425 * of consolidating the remainder.
429 */ 426 */
430 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / 427 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
431 FAKE_NODE_MIN_SIZE; 428 FAKE_NODE_MIN_SIZE;
432 429
433 size &= FAKE_NODE_MIN_HASH_MASK; 430 size &= FAKE_NODE_MIN_HASH_MASK;
@@ -502,77 +499,99 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
502} 499}
503 500
504/* 501/*
505 * Splits num_nodes nodes up equally starting at node_start. The return value 502 * Returns the end address of a node so that there is at least `size' amount of
506 * is the number of nodes split up and addr is adjusted to be at the end of the 503 * non-reserved memory or `max_addr' is reached.
507 * last node allocated.
508 */ 504 */
509static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, 505static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
510 int num_nodes)
511{ 506{
512 unsigned int big; 507 u64 end = start + size;
513 u64 size;
514 int i;
515 508
516 if (num_nodes <= 0) 509 while (end - start - e820_hole_size(start, end) < size) {
517 return -1; 510 end += FAKE_NODE_MIN_SIZE;
518 if (num_nodes > MAX_NUMNODES) 511 if (end > max_addr) {
519 num_nodes = MAX_NUMNODES;
520 size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
521 num_nodes;
522 /*
523 * Calculate the number of big nodes that can be allocated as a result
524 * of consolidating the leftovers.
525 */
526 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
527 FAKE_NODE_MIN_SIZE;
528
529 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
530 size &= FAKE_NODE_MIN_HASH_MASK;
531 if (!size) {
532 printk(KERN_ERR "Not enough memory for each node. "
533 "NUMA emulation disabled.\n");
534 return -1;
535 }
536
537 for (i = node_start; i < num_nodes + node_start; i++) {
538 u64 end = *addr + size;
539
540 if (i < big)
541 end += FAKE_NODE_MIN_SIZE;
542 /*
543 * The final node can have the remaining system RAM. Other
544 * nodes receive roughly the same amount of available pages.
545 */
546 if (i == num_nodes + node_start - 1)
547 end = max_addr; 512 end = max_addr;
548 else
549 while (end - *addr - e820_hole_size(*addr, end) <
550 size) {
551 end += FAKE_NODE_MIN_SIZE;
552 if (end > max_addr) {
553 end = max_addr;
554 break;
555 }
556 }
557 if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
558 break; 513 break;
514 }
559 } 515 }
560 return i - node_start + 1; 516 return end;
561} 517}
562 518
563/* 519/*
564 * Splits the remaining system RAM into chunks of size. The remaining memory is 520 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
565 * always assigned to a final node and can be asymmetric. Returns the number of 521 * `addr' to `max_addr'. The return value is the number of nodes allocated.
566 * nodes split.
567 */ 522 */
568static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, 523static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
569 u64 size)
570{ 524{
571 int i = node_start; 525 nodemask_t physnode_mask = NODE_MASK_NONE;
572 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; 526 u64 min_size;
573 while (!setup_node_range(i++, addr, size, max_addr)) 527 int ret = 0;
574 ; 528 int i;
575 return i - node_start; 529
530 if (!size)
531 return -1;
532 /*
533 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
534 * increased accordingly if the requested size is too small. This
535 * creates a uniform distribution of node sizes across the entire
536 * machine (but not necessarily over physical nodes).
537 */
538 min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
539 MAX_NUMNODES;
540 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
541 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
542 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
543 FAKE_NODE_MIN_HASH_MASK;
544 if (size < min_size) {
545 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
546 size >> 20, min_size >> 20);
547 size = min_size;
548 }
549 size &= FAKE_NODE_MIN_HASH_MASK;
550
551 for (i = 0; i < MAX_NUMNODES; i++)
552 if (physnodes[i].start != physnodes[i].end)
553 node_set(i, physnode_mask);
554 /*
555 * Fill physical nodes with fake nodes of size until there is no memory
556 * left on any of them.
557 */
558 while (nodes_weight(physnode_mask)) {
559 for_each_node_mask(i, physnode_mask) {
560 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
561 u64 end;
562
563 end = find_end_of_node(physnodes[i].start,
564 physnodes[i].end, size);
565 /*
566 * If there won't be at least FAKE_NODE_MIN_SIZE of
567 * non-reserved memory in ZONE_DMA32 for the next node,
568 * this one must extend to the boundary.
569 */
570 if (end < dma32_end && dma32_end - end -
571 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
572 end = dma32_end;
573
574 /*
575 * If there won't be enough non-reserved memory for the
576 * next node, this one must extend to the end of the
577 * physical node.
578 */
579 if (physnodes[i].end - end -
580 e820_hole_size(end, physnodes[i].end) < size)
581 end = physnodes[i].end;
582
583 /*
584 * Setup the fake node that will be allocated as bootmem
585 * later. If setup_node_range() returns non-zero, there
586 * is no more memory available on this physical node.
587 */
588 if (setup_node_range(ret++, &physnodes[i].start,
589 end - physnodes[i].start,
590 physnodes[i].end) < 0)
591 node_clear(i, physnode_mask);
592 }
593 }
594 return ret;
576} 595}
577 596
578/* 597/*
@@ -582,87 +601,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
582static int __init numa_emulation(unsigned long start_pfn, 601static int __init numa_emulation(unsigned long start_pfn,
583 unsigned long last_pfn, int acpi, int k8) 602 unsigned long last_pfn, int acpi, int k8)
584{ 603{
585 u64 size, addr = start_pfn << PAGE_SHIFT; 604 u64 addr = start_pfn << PAGE_SHIFT;
586 u64 max_addr = last_pfn << PAGE_SHIFT; 605 u64 max_addr = last_pfn << PAGE_SHIFT;
587 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
588 int num_phys_nodes; 606 int num_phys_nodes;
607 int num_nodes;
608 int i;
589 609
590 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); 610 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
591 /* 611 /*
592 * If the numa=fake command-line is just a single number N, split the 612 * If the numa=fake command-line contains a 'M' or 'G', it represents
593 * system RAM into N fake nodes. 613 * the fixed node size. Otherwise, if it is just a single number N,
614 * split the system RAM into N fake nodes.
594 */ 615 */
595 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 616 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
596 long n = simple_strtol(cmdline, NULL, 0); 617 u64 size;
597
598 num_nodes = split_nodes_interleave(addr, max_addr,
599 num_phys_nodes, n);
600 if (num_nodes < 0)
601 return num_nodes;
602 goto out;
603 }
604 618
605 /* Parse the command line. */ 619 size = memparse(cmdline, &cmdline);
606 for (coeff_flag = 0; ; cmdline++) { 620 num_nodes = split_nodes_size_interleave(addr, max_addr, size);
607 if (*cmdline && isdigit(*cmdline)) { 621 } else {
608 num = num * 10 + *cmdline - '0'; 622 unsigned long n;
609 continue; 623
610 } 624 n = simple_strtoul(cmdline, NULL, 0);
611 if (*cmdline == '*') { 625 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
612 if (num > 0)
613 coeff = num;
614 coeff_flag = 1;
615 }
616 if (!*cmdline || *cmdline == ',') {
617 if (!coeff_flag)
618 coeff = 1;
619 /*
620 * Round down to the nearest FAKE_NODE_MIN_SIZE.
621 * Command-line coefficients are in megabytes.
622 */
623 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
624 if (size)
625 for (i = 0; i < coeff; i++, num_nodes++)
626 if (setup_node_range(num_nodes, &addr,
627 size, max_addr) < 0)
628 goto done;
629 if (!*cmdline)
630 break;
631 coeff_flag = 0;
632 coeff = -1;
633 }
634 num = 0;
635 }
636done:
637 if (!num_nodes)
638 return -1;
639 /* Fill remainder of system RAM, if appropriate. */
640 if (addr < max_addr) {
641 if (coeff_flag && coeff < 0) {
642 /* Split remaining nodes into num-sized chunks */
643 num_nodes += split_nodes_by_size(&addr, max_addr,
644 num_nodes, num);
645 goto out;
646 }
647 switch (*(cmdline - 1)) {
648 case '*':
649 /* Split remaining nodes into coeff chunks */
650 if (coeff <= 0)
651 break;
652 num_nodes += split_nodes_equally(&addr, max_addr,
653 num_nodes, coeff);
654 break;
655 case ',':
656 /* Do not allocate remaining system RAM */
657 break;
658 default:
659 /* Give one final node */
660 setup_node_range(num_nodes, &addr, max_addr - addr,
661 max_addr);
662 num_nodes++;
663 }
664 } 626 }
665out: 627
628 if (num_nodes < 0)
629 return num_nodes;
666 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); 630 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
667 if (memnode_shift < 0) { 631 if (memnode_shift < 0) {
668 memnode_shift = 0; 632 memnode_shift = 0;
@@ -742,6 +706,10 @@ unsigned long __init numa_free_all_bootmem(void)
742 for_each_online_node(i) 706 for_each_online_node(i)
743 pages += free_all_bootmem_node(NODE_DATA(i)); 707 pages += free_all_bootmem_node(NODE_DATA(i));
744 708
709#ifdef CONFIG_NO_BOOTMEM
710 pages += free_all_memory_core_early(MAX_NUMNODES);
711#endif
712
745 return pages; 713 return pages;
746} 714}
747 715
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ed34f5e35999..c9ba9deafe83 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -6,6 +6,14 @@
6 6
7#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO 7#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
8 8
9#ifdef CONFIG_HIGHPTE
10#define PGALLOC_USER_GFP __GFP_HIGHMEM
11#else
12#define PGALLOC_USER_GFP 0
13#endif
14
15gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
16
9pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 17pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
10{ 18{
11 return (pte_t *)__get_free_page(PGALLOC_GFP); 19 return (pte_t *)__get_free_page(PGALLOC_GFP);
@@ -15,16 +23,29 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
15{ 23{
16 struct page *pte; 24 struct page *pte;
17 25
18#ifdef CONFIG_HIGHPTE 26 pte = alloc_pages(__userpte_alloc_gfp, 0);
19 pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
20#else
21 pte = alloc_pages(PGALLOC_GFP, 0);
22#endif
23 if (pte) 27 if (pte)
24 pgtable_page_ctor(pte); 28 pgtable_page_ctor(pte);
25 return pte; 29 return pte;
26} 30}
27 31
32static int __init setup_userpte(char *arg)
33{
34 if (!arg)
35 return -EINVAL;
36
37 /*
38 * "userpte=nohigh" disables allocation of user pagetables in
39 * high memory.
40 */
41 if (strcmp(arg, "nohigh") == 0)
42 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
43 else
44 return -EINVAL;
45 return 0;
46}
47early_param("userpte", setup_userpte);
48
28void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 49void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
29{ 50{
30 pgtable_page_dtor(pte); 51 pgtable_page_dtor(pte);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 65b58e4b0b8b..426f3a1a64d3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -41,7 +41,7 @@ union smp_flush_state {
41 struct { 41 struct {
42 struct mm_struct *flush_mm; 42 struct mm_struct *flush_mm;
43 unsigned long flush_va; 43 unsigned long flush_va;
44 spinlock_t tlbstate_lock; 44 raw_spinlock_t tlbstate_lock;
45 DECLARE_BITMAP(flush_cpumask, NR_CPUS); 45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
46 }; 46 };
47 char pad[INTERNODE_CACHE_BYTES]; 47 char pad[INTERNODE_CACHE_BYTES];
@@ -181,7 +181,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
181 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is 181 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
182 * probably not worth checking this for a cache-hot lock. 182 * probably not worth checking this for a cache-hot lock.
183 */ 183 */
184 spin_lock(&f->tlbstate_lock); 184 raw_spin_lock(&f->tlbstate_lock);
185 185
186 f->flush_mm = mm; 186 f->flush_mm = mm;
187 f->flush_va = va; 187 f->flush_va = va;
@@ -199,7 +199,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
199 199
200 f->flush_mm = NULL; 200 f->flush_mm = NULL;
201 f->flush_va = 0; 201 f->flush_va = 0;
202 spin_unlock(&f->tlbstate_lock); 202 raw_spin_unlock(&f->tlbstate_lock);
203} 203}
204 204
205void native_flush_tlb_others(const struct cpumask *cpumask, 205void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -223,7 +223,7 @@ static int __cpuinit init_smp_flush(void)
223 int i; 223 int i;
224 224
225 for (i = 0; i < ARRAY_SIZE(flush_state); i++) 225 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
226 spin_lock_init(&flush_state[i].tlbstate_lock); 226 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
227 227
228 return 0; 228 return 0;
229} 229}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 3347f696edc7..2c505ee71014 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -159,7 +159,7 @@ static int nmi_setup_mux(void)
159 159
160 for_each_possible_cpu(i) { 160 for_each_possible_cpu(i) {
161 per_cpu(cpu_msrs, i).multiplex = 161 per_cpu(cpu_msrs, i).multiplex =
162 kmalloc(multiplex_size, GFP_KERNEL); 162 kzalloc(multiplex_size, GFP_KERNEL);
163 if (!per_cpu(cpu_msrs, i).multiplex) 163 if (!per_cpu(cpu_msrs, i).multiplex)
164 return 0; 164 return 0;
165 } 165 }
@@ -179,7 +179,6 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
179 if (counter_config[i].enabled) { 179 if (counter_config[i].enabled) {
180 multiplex[i].saved = -(u64)counter_config[i].count; 180 multiplex[i].saved = -(u64)counter_config[i].count;
181 } else { 181 } else {
182 multiplex[i].addr = 0;
183 multiplex[i].saved = 0; 182 multiplex[i].saved = 0;
184 } 183 }
185 } 184 }
@@ -189,25 +188,27 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
189 188
190static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) 189static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
191{ 190{
191 struct op_msr *counters = msrs->counters;
192 struct op_msr *multiplex = msrs->multiplex; 192 struct op_msr *multiplex = msrs->multiplex;
193 int i; 193 int i;
194 194
195 for (i = 0; i < model->num_counters; ++i) { 195 for (i = 0; i < model->num_counters; ++i) {
196 int virt = op_x86_phys_to_virt(i); 196 int virt = op_x86_phys_to_virt(i);
197 if (multiplex[virt].addr) 197 if (counters[i].addr)
198 rdmsrl(multiplex[virt].addr, multiplex[virt].saved); 198 rdmsrl(counters[i].addr, multiplex[virt].saved);
199 } 199 }
200} 200}
201 201
202static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) 202static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
203{ 203{
204 struct op_msr *counters = msrs->counters;
204 struct op_msr *multiplex = msrs->multiplex; 205 struct op_msr *multiplex = msrs->multiplex;
205 int i; 206 int i;
206 207
207 for (i = 0; i < model->num_counters; ++i) { 208 for (i = 0; i < model->num_counters; ++i) {
208 int virt = op_x86_phys_to_virt(i); 209 int virt = op_x86_phys_to_virt(i);
209 if (multiplex[virt].addr) 210 if (counters[i].addr)
210 wrmsrl(multiplex[virt].addr, multiplex[virt].saved); 211 wrmsrl(counters[i].addr, multiplex[virt].saved);
211 } 212 }
212} 213}
213 214
@@ -303,11 +304,11 @@ static int allocate_msrs(void)
303 304
304 int i; 305 int i;
305 for_each_possible_cpu(i) { 306 for_each_possible_cpu(i) {
306 per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, 307 per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
307 GFP_KERNEL); 308 GFP_KERNEL);
308 if (!per_cpu(cpu_msrs, i).counters) 309 if (!per_cpu(cpu_msrs, i).counters)
309 return 0; 310 return 0;
310 per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, 311 per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
311 GFP_KERNEL); 312 GFP_KERNEL);
312 if (!per_cpu(cpu_msrs, i).controls) 313 if (!per_cpu(cpu_msrs, i).controls)
313 return 0; 314 return 0;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 39686c29f03a..090cbbec7dbd 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -22,6 +22,9 @@
22#include <asm/ptrace.h> 22#include <asm/ptrace.h>
23#include <asm/msr.h> 23#include <asm/msr.h>
24#include <asm/nmi.h> 24#include <asm/nmi.h>
25#include <asm/apic.h>
26#include <asm/processor.h>
27#include <asm/cpufeature.h>
25 28
26#include "op_x86_model.h" 29#include "op_x86_model.h"
27#include "op_counter.h" 30#include "op_counter.h"
@@ -43,23 +46,10 @@
43 46
44static unsigned long reset_value[NUM_VIRT_COUNTERS]; 47static unsigned long reset_value[NUM_VIRT_COUNTERS];
45 48
46#ifdef CONFIG_OPROFILE_IBS
47
48/* IbsFetchCtl bits/masks */
49#define IBS_FETCH_RAND_EN (1ULL<<57)
50#define IBS_FETCH_VAL (1ULL<<49)
51#define IBS_FETCH_ENABLE (1ULL<<48)
52#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL
53
54/*IbsOpCtl bits */
55#define IBS_OP_CNT_CTL (1ULL<<19)
56#define IBS_OP_VAL (1ULL<<18)
57#define IBS_OP_ENABLE (1ULL<<17)
58
59#define IBS_FETCH_SIZE 6 49#define IBS_FETCH_SIZE 6
60#define IBS_OP_SIZE 12 50#define IBS_OP_SIZE 12
61 51
62static int has_ibs; /* AMD Family10h and later */ 52static u32 ibs_caps;
63 53
64struct op_ibs_config { 54struct op_ibs_config {
65 unsigned long op_enabled; 55 unsigned long op_enabled;
@@ -71,24 +61,52 @@ struct op_ibs_config {
71}; 61};
72 62
73static struct op_ibs_config ibs_config; 63static struct op_ibs_config ibs_config;
64static u64 ibs_op_ctl;
74 65
75#endif 66/*
67 * IBS cpuid feature detection
68 */
76 69
77#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 70#define IBS_CPUID_FEATURES 0x8000001b
71
72/*
73 * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
74 * bit 0 is used to indicate the existence of IBS.
75 */
76#define IBS_CAPS_AVAIL (1LL<<0)
77#define IBS_CAPS_RDWROPCNT (1LL<<3)
78#define IBS_CAPS_OPCNT (1LL<<4)
78 79
79static void op_mux_fill_in_addresses(struct op_msrs * const msrs) 80/*
81 * IBS randomization macros
82 */
83#define IBS_RANDOM_BITS 12
84#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1)
85#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5))
86
87static u32 get_ibs_caps(void)
80{ 88{
81 int i; 89 u32 ibs_caps;
90 unsigned int max_level;
82 91
83 for (i = 0; i < NUM_VIRT_COUNTERS; i++) { 92 if (!boot_cpu_has(X86_FEATURE_IBS))
84 int hw_counter = op_x86_virt_to_phys(i); 93 return 0;
85 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) 94
86 msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; 95 /* check IBS cpuid feature flags */
87 else 96 max_level = cpuid_eax(0x80000000);
88 msrs->multiplex[i].addr = 0; 97 if (max_level < IBS_CPUID_FEATURES)
89 } 98 return IBS_CAPS_AVAIL;
99
100 ibs_caps = cpuid_eax(IBS_CPUID_FEATURES);
101 if (!(ibs_caps & IBS_CAPS_AVAIL))
102 /* cpuid flags not valid */
103 return IBS_CAPS_AVAIL;
104
105 return ibs_caps;
90} 106}
91 107
108#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
109
92static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, 110static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
93 struct op_msrs const * const msrs) 111 struct op_msrs const * const msrs)
94{ 112{
@@ -98,7 +116,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
98 /* enable active counters */ 116 /* enable active counters */
99 for (i = 0; i < NUM_COUNTERS; ++i) { 117 for (i = 0; i < NUM_COUNTERS; ++i) {
100 int virt = op_x86_phys_to_virt(i); 118 int virt = op_x86_phys_to_virt(i);
101 if (!counter_config[virt].enabled) 119 if (!reset_value[virt])
102 continue; 120 continue;
103 rdmsrl(msrs->controls[i].addr, val); 121 rdmsrl(msrs->controls[i].addr, val);
104 val &= model->reserved; 122 val &= model->reserved;
@@ -107,10 +125,6 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
107 } 125 }
108} 126}
109 127
110#else
111
112static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { }
113
114#endif 128#endif
115 129
116/* functions for op_amd_spec */ 130/* functions for op_amd_spec */
@@ -122,18 +136,12 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
122 for (i = 0; i < NUM_COUNTERS; i++) { 136 for (i = 0; i < NUM_COUNTERS; i++) {
123 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) 137 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
124 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; 138 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
125 else
126 msrs->counters[i].addr = 0;
127 } 139 }
128 140
129 for (i = 0; i < NUM_CONTROLS; i++) { 141 for (i = 0; i < NUM_CONTROLS; i++) {
130 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) 142 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
131 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; 143 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
132 else
133 msrs->controls[i].addr = 0;
134 } 144 }
135
136 op_mux_fill_in_addresses(msrs);
137} 145}
138 146
139static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, 147static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
@@ -144,7 +152,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
144 152
145 /* setup reset_value */ 153 /* setup reset_value */
146 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { 154 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
147 if (counter_config[i].enabled) 155 if (counter_config[i].enabled
156 && msrs->counters[op_x86_virt_to_phys(i)].addr)
148 reset_value[i] = counter_config[i].count; 157 reset_value[i] = counter_config[i].count;
149 else 158 else
150 reset_value[i] = 0; 159 reset_value[i] = 0;
@@ -152,9 +161,18 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
152 161
153 /* clear all counters */ 162 /* clear all counters */
154 for (i = 0; i < NUM_CONTROLS; ++i) { 163 for (i = 0; i < NUM_CONTROLS; ++i) {
155 if (unlikely(!msrs->controls[i].addr)) 164 if (unlikely(!msrs->controls[i].addr)) {
165 if (counter_config[i].enabled && !smp_processor_id())
166 /*
167 * counter is reserved, this is on all
168 * cpus, so report only for cpu #0
169 */
170 op_x86_warn_reserved(i);
156 continue; 171 continue;
172 }
157 rdmsrl(msrs->controls[i].addr, val); 173 rdmsrl(msrs->controls[i].addr, val);
174 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
175 op_x86_warn_in_use(i);
158 val &= model->reserved; 176 val &= model->reserved;
159 wrmsrl(msrs->controls[i].addr, val); 177 wrmsrl(msrs->controls[i].addr, val);
160 } 178 }
@@ -169,9 +187,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
169 /* enable active counters */ 187 /* enable active counters */
170 for (i = 0; i < NUM_COUNTERS; ++i) { 188 for (i = 0; i < NUM_COUNTERS; ++i) {
171 int virt = op_x86_phys_to_virt(i); 189 int virt = op_x86_phys_to_virt(i);
172 if (!counter_config[virt].enabled) 190 if (!reset_value[virt])
173 continue;
174 if (!msrs->counters[i].addr)
175 continue; 191 continue;
176 192
177 /* setup counter registers */ 193 /* setup counter registers */
@@ -185,7 +201,60 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
185 } 201 }
186} 202}
187 203
188#ifdef CONFIG_OPROFILE_IBS 204/*
205 * 16-bit Linear Feedback Shift Register (LFSR)
206 *
207 * 16 14 13 11
208 * Feedback polynomial = X + X + X + X + 1
209 */
210static unsigned int lfsr_random(void)
211{
212 static unsigned int lfsr_value = 0xF00D;
213 unsigned int bit;
214
215 /* Compute next bit to shift in */
216 bit = ((lfsr_value >> 0) ^
217 (lfsr_value >> 2) ^
218 (lfsr_value >> 3) ^
219 (lfsr_value >> 5)) & 0x0001;
220
221 /* Advance to next register value */
222 lfsr_value = (lfsr_value >> 1) | (bit << 15);
223
224 return lfsr_value;
225}
226
227/*
228 * IBS software randomization
229 *
230 * The IBS periodic op counter is randomized in software. The lower 12
231 * bits of the 20 bit counter are randomized. IbsOpCurCnt is
232 * initialized with a 12 bit random value.
233 */
234static inline u64 op_amd_randomize_ibs_op(u64 val)
235{
236 unsigned int random = lfsr_random();
237
238 if (!(ibs_caps & IBS_CAPS_RDWROPCNT))
239 /*
240 * Work around if the hw can not write to IbsOpCurCnt
241 *
242 * Randomize the lower 8 bits of the 16 bit
243 * IbsOpMaxCnt [15:0] value in the range of -128 to
244 * +127 by adding/subtracting an offset to the
245 * maximum count (IbsOpMaxCnt).
246 *
247 * To avoid over or underflows and protect upper bits
248 * starting at bit 16, the initial value for
249 * IbsOpMaxCnt must fit in the range from 0x0081 to
250 * 0xff80.
251 */
252 val += (s8)(random >> 4);
253 else
254 val |= (u64)(random & IBS_RANDOM_MASK) << 32;
255
256 return val;
257}
189 258
190static inline void 259static inline void
191op_amd_handle_ibs(struct pt_regs * const regs, 260op_amd_handle_ibs(struct pt_regs * const regs,
@@ -194,7 +263,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
194 u64 val, ctl; 263 u64 val, ctl;
195 struct op_entry entry; 264 struct op_entry entry;
196 265
197 if (!has_ibs) 266 if (!ibs_caps)
198 return; 267 return;
199 268
200 if (ibs_config.fetch_enabled) { 269 if (ibs_config.fetch_enabled) {
@@ -210,7 +279,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
210 oprofile_write_commit(&entry); 279 oprofile_write_commit(&entry);
211 280
212 /* reenable the IRQ */ 281 /* reenable the IRQ */
213 ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); 282 ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT);
214 ctl |= IBS_FETCH_ENABLE; 283 ctl |= IBS_FETCH_ENABLE;
215 wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); 284 wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
216 } 285 }
@@ -236,8 +305,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
236 oprofile_write_commit(&entry); 305 oprofile_write_commit(&entry);
237 306
238 /* reenable the IRQ */ 307 /* reenable the IRQ */
239 ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; 308 ctl = op_amd_randomize_ibs_op(ibs_op_ctl);
240 ctl |= IBS_OP_ENABLE;
241 wrmsrl(MSR_AMD64_IBSOPCTL, ctl); 309 wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
242 } 310 }
243 } 311 }
@@ -246,41 +314,57 @@ op_amd_handle_ibs(struct pt_regs * const regs,
246static inline void op_amd_start_ibs(void) 314static inline void op_amd_start_ibs(void)
247{ 315{
248 u64 val; 316 u64 val;
249 if (has_ibs && ibs_config.fetch_enabled) { 317
250 val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; 318 if (!ibs_caps)
319 return;
320
321 if (ibs_config.fetch_enabled) {
322 val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT;
251 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; 323 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
252 val |= IBS_FETCH_ENABLE; 324 val |= IBS_FETCH_ENABLE;
253 wrmsrl(MSR_AMD64_IBSFETCHCTL, val); 325 wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
254 } 326 }
255 327
256 if (has_ibs && ibs_config.op_enabled) { 328 if (ibs_config.op_enabled) {
257 val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; 329 ibs_op_ctl = ibs_config.max_cnt_op >> 4;
258 val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; 330 if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
259 val |= IBS_OP_ENABLE; 331 /*
332 * IbsOpCurCnt not supported. See
333 * op_amd_randomize_ibs_op() for details.
334 */
335 ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL);
336 } else {
337 /*
338 * The start value is randomized with a
339 * positive offset, we need to compensate it
340 * with the half of the randomized range. Also
341 * avoid underflows.
342 */
343 ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET,
344 IBS_OP_MAX_CNT);
345 }
346 if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops)
347 ibs_op_ctl |= IBS_OP_CNT_CTL;
348 ibs_op_ctl |= IBS_OP_ENABLE;
349 val = op_amd_randomize_ibs_op(ibs_op_ctl);
260 wrmsrl(MSR_AMD64_IBSOPCTL, val); 350 wrmsrl(MSR_AMD64_IBSOPCTL, val);
261 } 351 }
262} 352}
263 353
264static void op_amd_stop_ibs(void) 354static void op_amd_stop_ibs(void)
265{ 355{
266 if (has_ibs && ibs_config.fetch_enabled) 356 if (!ibs_caps)
357 return;
358
359 if (ibs_config.fetch_enabled)
267 /* clear max count and enable */ 360 /* clear max count and enable */
268 wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); 361 wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
269 362
270 if (has_ibs && ibs_config.op_enabled) 363 if (ibs_config.op_enabled)
271 /* clear max count and enable */ 364 /* clear max count and enable */
272 wrmsrl(MSR_AMD64_IBSOPCTL, 0); 365 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
273} 366}
274 367
275#else
276
277static inline void op_amd_handle_ibs(struct pt_regs * const regs,
278 struct op_msrs const * const msrs) { }
279static inline void op_amd_start_ibs(void) { }
280static inline void op_amd_stop_ibs(void) { }
281
282#endif
283
284static int op_amd_check_ctrs(struct pt_regs * const regs, 368static int op_amd_check_ctrs(struct pt_regs * const regs,
285 struct op_msrs const * const msrs) 369 struct op_msrs const * const msrs)
286{ 370{
@@ -314,7 +398,7 @@ static void op_amd_start(struct op_msrs const * const msrs)
314 if (!reset_value[op_x86_phys_to_virt(i)]) 398 if (!reset_value[op_x86_phys_to_virt(i)])
315 continue; 399 continue;
316 rdmsrl(msrs->controls[i].addr, val); 400 rdmsrl(msrs->controls[i].addr, val);
317 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 401 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
318 wrmsrl(msrs->controls[i].addr, val); 402 wrmsrl(msrs->controls[i].addr, val);
319 } 403 }
320 404
@@ -334,7 +418,7 @@ static void op_amd_stop(struct op_msrs const * const msrs)
334 if (!reset_value[op_x86_phys_to_virt(i)]) 418 if (!reset_value[op_x86_phys_to_virt(i)])
335 continue; 419 continue;
336 rdmsrl(msrs->controls[i].addr, val); 420 rdmsrl(msrs->controls[i].addr, val);
337 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 421 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
338 wrmsrl(msrs->controls[i].addr, val); 422 wrmsrl(msrs->controls[i].addr, val);
339 } 423 }
340 424
@@ -355,8 +439,6 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
355 } 439 }
356} 440}
357 441
358#ifdef CONFIG_OPROFILE_IBS
359
360static u8 ibs_eilvt_off; 442static u8 ibs_eilvt_off;
361 443
362static inline void apic_init_ibs_nmi_per_cpu(void *arg) 444static inline void apic_init_ibs_nmi_per_cpu(void *arg)
@@ -405,45 +487,36 @@ static int init_ibs_nmi(void)
405 return 1; 487 return 1;
406 } 488 }
407 489
408#ifdef CONFIG_NUMA
409 /* Sanity check */
410 /* Works only for 64bit with proper numa implementation. */
411 if (nodes != num_possible_nodes()) {
412 printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, "
413 "found: %d, expected %d",
414 nodes, num_possible_nodes());
415 return 1;
416 }
417#endif
418 return 0; 490 return 0;
419} 491}
420 492
421/* uninitialize the APIC for the IBS interrupts if needed */ 493/* uninitialize the APIC for the IBS interrupts if needed */
422static void clear_ibs_nmi(void) 494static void clear_ibs_nmi(void)
423{ 495{
424 if (has_ibs) 496 if (ibs_caps)
425 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); 497 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
426} 498}
427 499
428/* initialize the APIC for the IBS interrupts if available */ 500/* initialize the APIC for the IBS interrupts if available */
429static void ibs_init(void) 501static void ibs_init(void)
430{ 502{
431 has_ibs = boot_cpu_has(X86_FEATURE_IBS); 503 ibs_caps = get_ibs_caps();
432 504
433 if (!has_ibs) 505 if (!ibs_caps)
434 return; 506 return;
435 507
436 if (init_ibs_nmi()) { 508 if (init_ibs_nmi()) {
437 has_ibs = 0; 509 ibs_caps = 0;
438 return; 510 return;
439 } 511 }
440 512
441 printk(KERN_INFO "oprofile: AMD IBS detected\n"); 513 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n",
514 (unsigned)ibs_caps);
442} 515}
443 516
444static void ibs_exit(void) 517static void ibs_exit(void)
445{ 518{
446 if (!has_ibs) 519 if (!ibs_caps)
447 return; 520 return;
448 521
449 clear_ibs_nmi(); 522 clear_ibs_nmi();
@@ -463,7 +536,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
463 if (ret) 536 if (ret)
464 return ret; 537 return ret;
465 538
466 if (!has_ibs) 539 if (!ibs_caps)
467 return ret; 540 return ret;
468 541
469 /* model specific files */ 542 /* model specific files */
@@ -473,7 +546,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
473 ibs_config.fetch_enabled = 0; 546 ibs_config.fetch_enabled = 0;
474 ibs_config.max_cnt_op = 250000; 547 ibs_config.max_cnt_op = 250000;
475 ibs_config.op_enabled = 0; 548 ibs_config.op_enabled = 0;
476 ibs_config.dispatched_ops = 1; 549 ibs_config.dispatched_ops = 0;
477 550
478 dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); 551 dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
479 oprofilefs_create_ulong(sb, dir, "enable", 552 oprofilefs_create_ulong(sb, dir, "enable",
@@ -488,8 +561,9 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
488 &ibs_config.op_enabled); 561 &ibs_config.op_enabled);
489 oprofilefs_create_ulong(sb, dir, "max_count", 562 oprofilefs_create_ulong(sb, dir, "max_count",
490 &ibs_config.max_cnt_op); 563 &ibs_config.max_cnt_op);
491 oprofilefs_create_ulong(sb, dir, "dispatched_ops", 564 if (ibs_caps & IBS_CAPS_OPCNT)
492 &ibs_config.dispatched_ops); 565 oprofilefs_create_ulong(sb, dir, "dispatched_ops",
566 &ibs_config.dispatched_ops);
493 567
494 return 0; 568 return 0;
495} 569}
@@ -507,19 +581,6 @@ static void op_amd_exit(void)
507 ibs_exit(); 581 ibs_exit();
508} 582}
509 583
510#else
511
512/* no IBS support */
513
514static int op_amd_init(struct oprofile_operations *ops)
515{
516 return 0;
517}
518
519static void op_amd_exit(void) {}
520
521#endif /* CONFIG_OPROFILE_IBS */
522
523struct op_x86_model_spec op_amd_spec = { 584struct op_x86_model_spec op_amd_spec = {
524 .num_counters = NUM_COUNTERS, 585 .num_counters = NUM_COUNTERS,
525 .num_controls = NUM_CONTROLS, 586 .num_controls = NUM_CONTROLS,
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index ac6b354becdf..e6a160a4684a 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -394,12 +394,6 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
394 setup_num_counters(); 394 setup_num_counters();
395 stag = get_stagger(); 395 stag = get_stagger();
396 396
397 /* initialize some registers */
398 for (i = 0; i < num_counters; ++i)
399 msrs->counters[i].addr = 0;
400 for (i = 0; i < num_controls; ++i)
401 msrs->controls[i].addr = 0;
402
403 /* the counter & cccr registers we pay attention to */ 397 /* the counter & cccr registers we pay attention to */
404 for (i = 0; i < num_counters; ++i) { 398 for (i = 0; i < num_counters; ++i) {
405 addr = p4_counters[VIRT_CTR(stag, i)].counter_address; 399 addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 8eb05878554c..2bf90fafa7b5 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -37,15 +37,11 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs)
37 for (i = 0; i < num_counters; i++) { 37 for (i = 0; i < num_counters; i++) {
38 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) 38 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
39 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; 39 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
40 else
41 msrs->counters[i].addr = 0;
42 } 40 }
43 41
44 for (i = 0; i < num_counters; i++) { 42 for (i = 0; i < num_counters; i++) {
45 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) 43 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
46 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; 44 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
47 else
48 msrs->controls[i].addr = 0;
49 } 45 }
50} 46}
51 47
@@ -57,7 +53,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
57 int i; 53 int i;
58 54
59 if (!reset_value) { 55 if (!reset_value) {
60 reset_value = kmalloc(sizeof(reset_value[0]) * num_counters, 56 reset_value = kzalloc(sizeof(reset_value[0]) * num_counters,
61 GFP_ATOMIC); 57 GFP_ATOMIC);
62 if (!reset_value) 58 if (!reset_value)
63 return; 59 return;
@@ -82,9 +78,18 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
82 78
83 /* clear all counters */ 79 /* clear all counters */
84 for (i = 0; i < num_counters; ++i) { 80 for (i = 0; i < num_counters; ++i) {
85 if (unlikely(!msrs->controls[i].addr)) 81 if (unlikely(!msrs->controls[i].addr)) {
82 if (counter_config[i].enabled && !smp_processor_id())
83 /*
84 * counter is reserved, this is on all
85 * cpus, so report only for cpu #0
86 */
87 op_x86_warn_reserved(i);
86 continue; 88 continue;
89 }
87 rdmsrl(msrs->controls[i].addr, val); 90 rdmsrl(msrs->controls[i].addr, val);
91 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
92 op_x86_warn_in_use(i);
88 val &= model->reserved; 93 val &= model->reserved;
89 wrmsrl(msrs->controls[i].addr, val); 94 wrmsrl(msrs->controls[i].addr, val);
90 } 95 }
@@ -161,7 +166,7 @@ static void ppro_start(struct op_msrs const * const msrs)
161 for (i = 0; i < num_counters; ++i) { 166 for (i = 0; i < num_counters; ++i) {
162 if (reset_value[i]) { 167 if (reset_value[i]) {
163 rdmsrl(msrs->controls[i].addr, val); 168 rdmsrl(msrs->controls[i].addr, val);
164 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 169 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
165 wrmsrl(msrs->controls[i].addr, val); 170 wrmsrl(msrs->controls[i].addr, val);
166 } 171 }
167 } 172 }
@@ -179,7 +184,7 @@ static void ppro_stop(struct op_msrs const * const msrs)
179 if (!reset_value[i]) 184 if (!reset_value[i])
180 continue; 185 continue;
181 rdmsrl(msrs->controls[i].addr, val); 186 rdmsrl(msrs->controls[i].addr, val);
182 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 187 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
183 wrmsrl(msrs->controls[i].addr, val); 188 wrmsrl(msrs->controls[i].addr, val);
184 } 189 }
185} 190}
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 7b8e75d16081..ff82a755edd4 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -57,6 +57,26 @@ struct op_x86_model_spec {
57 57
58struct op_counter_config; 58struct op_counter_config;
59 59
60static inline void op_x86_warn_in_use(int counter)
61{
62 /*
63 * The warning indicates an already running counter. If
64 * oprofile doesn't collect data, then try using a different
65 * performance counter on your platform to monitor the desired
66 * event. Delete counter #%d from the desired event by editing
67 * the /usr/share/oprofile/%s/<cpu>/events file. If the event
68 * cannot be monitored by any other counter, contact your
69 * hardware or BIOS vendor.
70 */
71 pr_warning("oprofile: counter #%d on cpu #%d may already be used\n",
72 counter, smp_processor_id());
73}
74
75static inline void op_x86_warn_reserved(int counter)
76{
77 pr_warning("oprofile: counter #%d is already reserved\n", counter);
78}
79
60extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, 80extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
61 struct op_counter_config *counter_config); 81 struct op_counter_config *counter_config);
62extern int op_x86_phys_to_virt(int phys); 82extern int op_x86_phys_to_virt(int phys);
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 39fba37f702f..b110d97fb925 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -13,9 +13,10 @@ obj-$(CONFIG_X86_VISWS) += visws.o
13 13
14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15 15
16obj-$(CONFIG_X86_MRST) += mrst.o
17
16obj-y += common.o early.o 18obj-y += common.o early.o
17obj-y += amd_bus.o 19obj-y += amd_bus.o bus_numa.o
18obj-$(CONFIG_X86_64) += bus_numa.o
19 20
20ifeq ($(CONFIG_PCI_DEBUG),y) 21ifeq ($(CONFIG_PCI_DEBUG),y)
21EXTRA_CFLAGS += -DDEBUG 22EXTRA_CFLAGS += -DDEBUG
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 959e548a7039..6e22454bfaa6 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -15,6 +15,51 @@ struct pci_root_info {
15 int busnum; 15 int busnum;
16}; 16};
17 17
18static bool pci_use_crs = true;
19
20static int __init set_use_crs(const struct dmi_system_id *id)
21{
22 pci_use_crs = true;
23 return 0;
24}
25
26static const struct dmi_system_id pci_use_crs_table[] __initconst = {
27 /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
28 {
29 .callback = set_use_crs,
30 .ident = "IBM System x3800",
31 .matches = {
32 DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
33 DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
34 },
35 },
36 {}
37};
38
39void __init pci_acpi_crs_quirks(void)
40{
41 int year;
42
43 if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008)
44 pci_use_crs = false;
45
46 dmi_check_system(pci_use_crs_table);
47
48 /*
49 * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that
50 * takes precedence over anything we figured out above.
51 */
52 if (pci_probe & PCI_ROOT_NO_CRS)
53 pci_use_crs = false;
54 else if (pci_probe & PCI_USE__CRS)
55 pci_use_crs = true;
56
57 printk(KERN_INFO "PCI: %s host bridge windows from ACPI; "
58 "if necessary, use \"pci=%s\" and report a bug\n",
59 pci_use_crs ? "Using" : "Ignoring",
60 pci_use_crs ? "nocrs" : "use_crs");
61}
62
18static acpi_status 63static acpi_status
19resource_to_addr(struct acpi_resource *resource, 64resource_to_addr(struct acpi_resource *resource,
20 struct acpi_resource_address64 *addr) 65 struct acpi_resource_address64 *addr)
@@ -45,20 +90,6 @@ count_resource(struct acpi_resource *acpi_res, void *data)
45 return AE_OK; 90 return AE_OK;
46} 91}
47 92
48static int
49bus_has_transparent_bridge(struct pci_bus *bus)
50{
51 struct pci_dev *dev;
52
53 list_for_each_entry(dev, &bus->devices, bus_list) {
54 u16 class = dev->class >> 8;
55
56 if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent)
57 return true;
58 }
59 return false;
60}
61
62static void 93static void
63align_resource(struct acpi_device *bridge, struct resource *res) 94align_resource(struct acpi_device *bridge, struct resource *res)
64{ 95{
@@ -92,12 +123,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
92 acpi_status status; 123 acpi_status status;
93 unsigned long flags; 124 unsigned long flags;
94 struct resource *root; 125 struct resource *root;
95 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES;
96 u64 start, end; 126 u64 start, end;
97 127
98 if (bus_has_transparent_bridge(info->bus))
99 max_root_bus_resources -= 3;
100
101 status = resource_to_addr(acpi_res, &addr); 128 status = resource_to_addr(acpi_res, &addr);
102 if (!ACPI_SUCCESS(status)) 129 if (!ACPI_SUCCESS(status))
103 return AE_OK; 130 return AE_OK;
@@ -115,15 +142,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
115 142
116 start = addr.minimum + addr.translation_offset; 143 start = addr.minimum + addr.translation_offset;
117 end = start + addr.address_length - 1; 144 end = start + addr.address_length - 1;
118 if (info->res_num >= max_root_bus_resources) {
119 if (pci_probe & PCI_USE__CRS)
120 printk(KERN_WARNING "PCI: Failed to allocate "
121 "0x%lx-0x%lx from %s for %s due to _CRS "
122 "returning more than %d resource descriptors\n",
123 (unsigned long) start, (unsigned long) end,
124 root->name, info->name, max_root_bus_resources);
125 return AE_OK;
126 }
127 145
128 res = &info->res[info->res_num]; 146 res = &info->res[info->res_num];
129 res->name = info->name; 147 res->name = info->name;
@@ -133,7 +151,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
133 res->child = NULL; 151 res->child = NULL;
134 align_resource(info->bridge, res); 152 align_resource(info->bridge, res);
135 153
136 if (!(pci_probe & PCI_USE__CRS)) { 154 if (!pci_use_crs) {
137 dev_printk(KERN_DEBUG, &info->bridge->dev, 155 dev_printk(KERN_DEBUG, &info->bridge->dev,
138 "host bridge window %pR (ignored)\n", res); 156 "host bridge window %pR (ignored)\n", res);
139 return AE_OK; 157 return AE_OK;
@@ -143,7 +161,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
143 dev_err(&info->bridge->dev, 161 dev_err(&info->bridge->dev,
144 "can't allocate host bridge window %pR\n", res); 162 "can't allocate host bridge window %pR\n", res);
145 } else { 163 } else {
146 info->bus->resource[info->res_num] = res; 164 pci_bus_add_resource(info->bus, res, 0);
147 info->res_num++; 165 info->res_num++;
148 if (addr.translation_offset) 166 if (addr.translation_offset)
149 dev_info(&info->bridge->dev, "host bridge window %pR " 167 dev_info(&info->bridge->dev, "host bridge window %pR "
@@ -164,10 +182,8 @@ get_current_resources(struct acpi_device *device, int busnum,
164 struct pci_root_info info; 182 struct pci_root_info info;
165 size_t size; 183 size_t size;
166 184
167 if (!(pci_probe & PCI_USE__CRS)) 185 if (pci_use_crs)
168 dev_info(&device->dev, 186 pci_bus_remove_resources(bus);
169 "ignoring host bridge windows from ACPI; "
170 "boot with \"pci=use_crs\" to use them\n");
171 187
172 info.bridge = device; 188 info.bridge = device;
173 info.bus = bus; 189 info.bus = bus;
@@ -282,17 +298,14 @@ int __init pci_acpi_init(void)
282{ 298{
283 struct pci_dev *dev = NULL; 299 struct pci_dev *dev = NULL;
284 300
285 if (pcibios_scanned)
286 return 0;
287
288 if (acpi_noirq) 301 if (acpi_noirq)
289 return 0; 302 return -ENODEV;
290 303
291 printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); 304 printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n");
292 acpi_irq_penalty_init(); 305 acpi_irq_penalty_init();
293 pcibios_scanned++;
294 pcibios_enable_irq = acpi_pci_irq_enable; 306 pcibios_enable_irq = acpi_pci_irq_enable;
295 pcibios_disable_irq = acpi_pci_irq_disable; 307 pcibios_disable_irq = acpi_pci_irq_disable;
308 x86_init.pci.init_irq = x86_init_noop;
296 309
297 if (pci_routeirq) { 310 if (pci_routeirq) {
298 /* 311 /*
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 95ecbd495955..fc1e8fe07e5c 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -2,11 +2,11 @@
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/topology.h> 3#include <linux/topology.h>
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/range.h>
6
5#include <asm/pci_x86.h> 7#include <asm/pci_x86.h>
6 8
7#ifdef CONFIG_X86_64
8#include <asm/pci-direct.h> 9#include <asm/pci-direct.h>
9#endif
10 10
11#include "bus_numa.h" 11#include "bus_numa.h"
12 12
@@ -15,60 +15,6 @@
15 * also get peer root bus resource for io,mmio 15 * also get peer root bus resource for io,mmio
16 */ 16 */
17 17
18#ifdef CONFIG_X86_64
19
20#define RANGE_NUM 16
21
22struct res_range {
23 size_t start;
24 size_t end;
25};
26
27static void __init update_range(struct res_range *range, size_t start,
28 size_t end)
29{
30 int i;
31 int j;
32
33 for (j = 0; j < RANGE_NUM; j++) {
34 if (!range[j].end)
35 continue;
36
37 if (start <= range[j].start && end >= range[j].end) {
38 range[j].start = 0;
39 range[j].end = 0;
40 continue;
41 }
42
43 if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) {
44 range[j].start = end + 1;
45 continue;
46 }
47
48
49 if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) {
50 range[j].end = start - 1;
51 continue;
52 }
53
54 if (start > range[j].start && end < range[j].end) {
55 /* find the new spare */
56 for (i = 0; i < RANGE_NUM; i++) {
57 if (range[i].end == 0)
58 break;
59 }
60 if (i < RANGE_NUM) {
61 range[i].end = range[j].end;
62 range[i].start = end + 1;
63 } else {
64 printk(KERN_ERR "run of slot in ranges\n");
65 }
66 range[j].end = start - 1;
67 continue;
68 }
69 }
70}
71
72struct pci_hostbridge_probe { 18struct pci_hostbridge_probe {
73 u32 bus; 19 u32 bus;
74 u32 slot; 20 u32 slot;
@@ -111,6 +57,8 @@ static void __init get_pci_mmcfg_amd_fam10h_range(void)
111 fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; 57 fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
112} 58}
113 59
60#define RANGE_NUM 16
61
114/** 62/**
115 * early_fill_mp_bus_to_node() 63 * early_fill_mp_bus_to_node()
116 * called before pcibios_scan_root and pci_scan_bus 64 * called before pcibios_scan_root and pci_scan_bus
@@ -130,16 +78,17 @@ static int __init early_fill_mp_bus_info(void)
130 struct pci_root_info *info; 78 struct pci_root_info *info;
131 u32 reg; 79 u32 reg;
132 struct resource *res; 80 struct resource *res;
133 size_t start; 81 u64 start;
134 size_t end; 82 u64 end;
135 struct res_range range[RANGE_NUM]; 83 struct range range[RANGE_NUM];
136 u64 val; 84 u64 val;
137 u32 address; 85 u32 address;
86 bool found;
138 87
139 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
140 return -1; 89 return -1;
141 90
142 found_all_numa_early = 0; 91 found = false;
143 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 92 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
144 u32 id; 93 u32 id;
145 u16 device; 94 u16 device;
@@ -153,12 +102,12 @@ static int __init early_fill_mp_bus_info(void)
153 device = (id>>16) & 0xffff; 102 device = (id>>16) & 0xffff;
154 if (pci_probes[i].vendor == vendor && 103 if (pci_probes[i].vendor == vendor &&
155 pci_probes[i].device == device) { 104 pci_probes[i].device == device) {
156 found_all_numa_early = 1; 105 found = true;
157 break; 106 break;
158 } 107 }
159 } 108 }
160 109
161 if (!found_all_numa_early) 110 if (!found)
162 return 0; 111 return 0;
163 112
164 pci_root_num = 0; 113 pci_root_num = 0;
@@ -196,7 +145,7 @@ static int __init early_fill_mp_bus_info(void)
196 def_link = (reg >> 8) & 0x03; 145 def_link = (reg >> 8) & 0x03;
197 146
198 memset(range, 0, sizeof(range)); 147 memset(range, 0, sizeof(range));
199 range[0].end = 0xffff; 148 add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);
200 /* io port resource */ 149 /* io port resource */
201 for (i = 0; i < 4; i++) { 150 for (i = 0; i < 4; i++) {
202 reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3)); 151 reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3));
@@ -220,13 +169,13 @@ static int __init early_fill_mp_bus_info(void)
220 169
221 info = &pci_root_info[j]; 170 info = &pci_root_info[j];
222 printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", 171 printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n",
223 node, link, (u64)start, (u64)end); 172 node, link, start, end);
224 173
225 /* kernel only handle 16 bit only */ 174 /* kernel only handle 16 bit only */
226 if (end > 0xffff) 175 if (end > 0xffff)
227 end = 0xffff; 176 end = 0xffff;
228 update_res(info, start, end, IORESOURCE_IO, 1); 177 update_res(info, start, end, IORESOURCE_IO, 1);
229 update_range(range, start, end); 178 subtract_range(range, RANGE_NUM, start, end + 1);
230 } 179 }
231 /* add left over io port range to def node/link, [0, 0xffff] */ 180 /* add left over io port range to def node/link, [0, 0xffff] */
232 /* find the position */ 181 /* find the position */
@@ -241,29 +190,32 @@ static int __init early_fill_mp_bus_info(void)
241 if (!range[i].end) 190 if (!range[i].end)
242 continue; 191 continue;
243 192
244 update_res(info, range[i].start, range[i].end, 193 update_res(info, range[i].start, range[i].end - 1,
245 IORESOURCE_IO, 1); 194 IORESOURCE_IO, 1);
246 } 195 }
247 } 196 }
248 197
249 memset(range, 0, sizeof(range)); 198 memset(range, 0, sizeof(range));
250 /* 0xfd00000000-0xffffffffff for HT */ 199 /* 0xfd00000000-0xffffffffff for HT */
251 range[0].end = (0xfdULL<<32) - 1; 200 end = cap_resource((0xfdULL<<32) - 1);
201 end++;
202 add_range(range, RANGE_NUM, 0, 0, end);
252 203
253 /* need to take out [0, TOM) for RAM*/ 204 /* need to take out [0, TOM) for RAM*/
254 address = MSR_K8_TOP_MEM1; 205 address = MSR_K8_TOP_MEM1;
255 rdmsrl(address, val); 206 rdmsrl(address, val);
256 end = (val & 0xffffff800000ULL); 207 end = (val & 0xffffff800000ULL);
257 printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); 208 printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20);
258 if (end < (1ULL<<32)) 209 if (end < (1ULL<<32))
259 update_range(range, 0, end - 1); 210 subtract_range(range, RANGE_NUM, 0, end);
260 211
261 /* get mmconfig */ 212 /* get mmconfig */
262 get_pci_mmcfg_amd_fam10h_range(); 213 get_pci_mmcfg_amd_fam10h_range();
263 /* need to take out mmconf range */ 214 /* need to take out mmconf range */
264 if (fam10h_mmconf_end) { 215 if (fam10h_mmconf_end) {
265 printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); 216 printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
266 update_range(range, fam10h_mmconf_start, fam10h_mmconf_end); 217 subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
218 fam10h_mmconf_end + 1);
267 } 219 }
268 220
269 /* mmio resource */ 221 /* mmio resource */
@@ -293,7 +245,7 @@ static int __init early_fill_mp_bus_info(void)
293 info = &pci_root_info[j]; 245 info = &pci_root_info[j];
294 246
295 printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", 247 printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]",
296 node, link, (u64)start, (u64)end); 248 node, link, start, end);
297 /* 249 /*
298 * some sick allocation would have range overlap with fam10h 250 * some sick allocation would have range overlap with fam10h
299 * mmconf range, so need to update start and end. 251 * mmconf range, so need to update start and end.
@@ -318,14 +270,15 @@ static int __init early_fill_mp_bus_info(void)
318 /* we got a hole */ 270 /* we got a hole */
319 endx = fam10h_mmconf_start - 1; 271 endx = fam10h_mmconf_start - 1;
320 update_res(info, start, endx, IORESOURCE_MEM, 0); 272 update_res(info, start, endx, IORESOURCE_MEM, 0);
321 update_range(range, start, endx); 273 subtract_range(range, RANGE_NUM, start,
322 printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); 274 endx + 1);
275 printk(KERN_CONT " ==> [%llx, %llx]", start, endx);
323 start = fam10h_mmconf_end + 1; 276 start = fam10h_mmconf_end + 1;
324 changed = 1; 277 changed = 1;
325 } 278 }
326 if (changed) { 279 if (changed) {
327 if (start <= end) { 280 if (start <= end) {
328 printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end); 281 printk(KERN_CONT " %s [%llx, %llx]", endx ? "and" : "==>", start, end);
329 } else { 282 } else {
330 printk(KERN_CONT "%s\n", endx?"":" ==> none"); 283 printk(KERN_CONT "%s\n", endx?"":" ==> none");
331 continue; 284 continue;
@@ -333,8 +286,9 @@ static int __init early_fill_mp_bus_info(void)
333 } 286 }
334 } 287 }
335 288
336 update_res(info, start, end, IORESOURCE_MEM, 1); 289 update_res(info, cap_resource(start), cap_resource(end),
337 update_range(range, start, end); 290 IORESOURCE_MEM, 1);
291 subtract_range(range, RANGE_NUM, start, end + 1);
338 printk(KERN_CONT "\n"); 292 printk(KERN_CONT "\n");
339 } 293 }
340 294
@@ -348,8 +302,8 @@ static int __init early_fill_mp_bus_info(void)
348 address = MSR_K8_TOP_MEM2; 302 address = MSR_K8_TOP_MEM2;
349 rdmsrl(address, val); 303 rdmsrl(address, val);
350 end = (val & 0xffffff800000ULL); 304 end = (val & 0xffffff800000ULL);
351 printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); 305 printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20);
352 update_range(range, 1ULL<<32, end - 1); 306 subtract_range(range, RANGE_NUM, 1ULL<<32, end);
353 } 307 }
354 308
355 /* 309 /*
@@ -368,7 +322,8 @@ static int __init early_fill_mp_bus_info(void)
368 if (!range[i].end) 322 if (!range[i].end)
369 continue; 323 continue;
370 324
371 update_res(info, range[i].start, range[i].end, 325 update_res(info, cap_resource(range[i].start),
326 cap_resource(range[i].end - 1),
372 IORESOURCE_MEM, 1); 327 IORESOURCE_MEM, 1);
373 } 328 }
374 } 329 }
@@ -384,24 +339,14 @@ static int __init early_fill_mp_bus_info(void)
384 info->bus_min, info->bus_max, info->node, info->link); 339 info->bus_min, info->bus_max, info->node, info->link);
385 for (j = 0; j < res_num; j++) { 340 for (j = 0; j < res_num; j++) {
386 res = &info->res[j]; 341 res = &info->res[j];
387 printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n", 342 printk(KERN_DEBUG "bus: %02x index %x %pR\n",
388 busnum, j, 343 busnum, j, res);
389 (res->flags & IORESOURCE_IO)?"io port":"mmio",
390 res->start, res->end);
391 } 344 }
392 } 345 }
393 346
394 return 0; 347 return 0;
395} 348}
396 349
397#else /* !CONFIG_X86_64 */
398
399static int __init early_fill_mp_bus_info(void) { return 0; }
400
401#endif /* !CONFIG_X86_64 */
402
403/* common 32/64 bit code */
404
405#define ENABLE_CF8_EXT_CFG (1ULL << 46) 350#define ENABLE_CF8_EXT_CFG (1ULL << 46)
406 351
407static void enable_pci_io_ecs(void *unused) 352static void enable_pci_io_ecs(void *unused)
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index f939d603adfa..64a122883896 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -1,11 +1,11 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/range.h>
3 4
4#include "bus_numa.h" 5#include "bus_numa.h"
5 6
6int pci_root_num; 7int pci_root_num;
7struct pci_root_info pci_root_info[PCI_ROOT_NR]; 8struct pci_root_info pci_root_info[PCI_ROOT_NR];
8int found_all_numa_early;
9 9
10void x86_pci_root_bus_res_quirks(struct pci_bus *b) 10void x86_pci_root_bus_res_quirks(struct pci_bus *b)
11{ 11{
@@ -21,10 +21,6 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
21 if (!pci_root_num) 21 if (!pci_root_num)
22 return; 22 return;
23 23
24 /* for amd, if only one root bus, don't need to do anything */
25 if (pci_root_num < 2 && found_all_numa_early)
26 return;
27
28 for (i = 0; i < pci_root_num; i++) { 24 for (i = 0; i < pci_root_num; i++) {
29 if (pci_root_info[i].bus_min == b->number) 25 if (pci_root_info[i].bus_min == b->number)
30 break; 26 break;
@@ -36,13 +32,14 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
36 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", 32 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
37 b->number); 33 b->number);
38 34
35 pci_bus_remove_resources(b);
39 info = &pci_root_info[i]; 36 info = &pci_root_info[i];
40 for (j = 0; j < info->res_num; j++) { 37 for (j = 0; j < info->res_num; j++) {
41 struct resource *res; 38 struct resource *res;
42 struct resource *root; 39 struct resource *root;
43 40
44 res = &info->res[j]; 41 res = &info->res[j];
45 b->resource[j] = res; 42 pci_bus_add_resource(b, res, 0);
46 if (res->flags & IORESOURCE_IO) 43 if (res->flags & IORESOURCE_IO)
47 root = &ioport_resource; 44 root = &ioport_resource;
48 else 45 else
@@ -51,8 +48,8 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
51 } 48 }
52} 49}
53 50
54void __devinit update_res(struct pci_root_info *info, size_t start, 51void __devinit update_res(struct pci_root_info *info, resource_size_t start,
55 size_t end, unsigned long flags, int merge) 52 resource_size_t end, unsigned long flags, int merge)
56{ 53{
57 int i; 54 int i;
58 struct resource *res; 55 struct resource *res;
@@ -60,25 +57,28 @@ void __devinit update_res(struct pci_root_info *info, size_t start,
60 if (start > end) 57 if (start > end)
61 return; 58 return;
62 59
60 if (start == MAX_RESOURCE)
61 return;
62
63 if (!merge) 63 if (!merge)
64 goto addit; 64 goto addit;
65 65
66 /* try to merge it with old one */ 66 /* try to merge it with old one */
67 for (i = 0; i < info->res_num; i++) { 67 for (i = 0; i < info->res_num; i++) {
68 size_t final_start, final_end; 68 resource_size_t final_start, final_end;
69 size_t common_start, common_end; 69 resource_size_t common_start, common_end;
70 70
71 res = &info->res[i]; 71 res = &info->res[i];
72 if (res->flags != flags) 72 if (res->flags != flags)
73 continue; 73 continue;
74 74
75 common_start = max((size_t)res->start, start); 75 common_start = max(res->start, start);
76 common_end = min((size_t)res->end, end); 76 common_end = min(res->end, end);
77 if (common_start > common_end + 1) 77 if (common_start > common_end + 1)
78 continue; 78 continue;
79 79
80 final_start = min((size_t)res->start, start); 80 final_start = min(res->start, start);
81 final_end = max((size_t)res->end, end); 81 final_end = max(res->end, end);
82 82
83 res->start = final_start; 83 res->start = final_start;
84 res->end = final_end; 84 res->end = final_end;
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index adbc23fe82ac..804a4b40c31a 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -1,9 +1,8 @@
1#ifdef CONFIG_X86_64 1#ifndef __BUS_NUMA_H
2 2#define __BUS_NUMA_H
3/* 3/*
4 * sub bus (transparent) will use entres from 3 to store extra from 4 * sub bus (transparent) will use entres from 3 to store extra from
5 * root, so need to make sure we have enough slot there, Should we 5 * root, so need to make sure we have enough slot there.
6 * increase PCI_BUS_NUM_RESOURCES?
7 */ 6 */
8#define RES_NUM 16 7#define RES_NUM 16
9struct pci_root_info { 8struct pci_root_info {
@@ -20,8 +19,7 @@ struct pci_root_info {
20#define PCI_ROOT_NR 4 19#define PCI_ROOT_NR 4
21extern int pci_root_num; 20extern int pci_root_num;
22extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; 21extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
23extern int found_all_numa_early;
24 22
25extern void update_res(struct pci_root_info *info, size_t start, 23extern void update_res(struct pci_root_info *info, resource_size_t start,
26 size_t end, unsigned long flags, int merge); 24 resource_size_t end, unsigned long flags, int merge);
27#endif 25#endif
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index d2552c68e94d..294e10cb11e1 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -72,12 +72,6 @@ struct pci_ops pci_root_ops = {
72}; 72};
73 73
74/* 74/*
75 * legacy, numa, and acpi all want to call pcibios_scan_root
76 * from their initcalls. This flag prevents that.
77 */
78int pcibios_scanned;
79
80/*
81 * This interrupt-safe spinlock protects all accesses to PCI 75 * This interrupt-safe spinlock protects all accesses to PCI
82 * configuration space. 76 * configuration space.
83 */ 77 */
@@ -520,6 +514,9 @@ char * __devinit pcibios_setup(char *str)
520 } else if (!strcmp(str, "use_crs")) { 514 } else if (!strcmp(str, "use_crs")) {
521 pci_probe |= PCI_USE__CRS; 515 pci_probe |= PCI_USE__CRS;
522 return NULL; 516 return NULL;
517 } else if (!strcmp(str, "nocrs")) {
518 pci_probe |= PCI_ROOT_NO_CRS;
519 return NULL;
523 } else if (!strcmp(str, "earlydump")) { 520 } else if (!strcmp(str, "earlydump")) {
524 pci_early_dump_regs = 1; 521 pci_early_dump_regs = 1;
525 return NULL; 522 return NULL;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 5dc9e8c63fcd..dece3eb9c906 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -60,22 +60,20 @@ skip_isa_ioresource_align(struct pci_dev *dev) {
60 * but we want to try to avoid allocating at 0x2900-0x2bff 60 * but we want to try to avoid allocating at 0x2900-0x2bff
61 * which might have be mirrored at 0x0100-0x03ff.. 61 * which might have be mirrored at 0x0100-0x03ff..
62 */ 62 */
63void 63resource_size_t
64pcibios_align_resource(void *data, struct resource *res, 64pcibios_align_resource(void *data, const struct resource *res,
65 resource_size_t size, resource_size_t align) 65 resource_size_t size, resource_size_t align)
66{ 66{
67 struct pci_dev *dev = data; 67 struct pci_dev *dev = data;
68 resource_size_t start = res->start;
68 69
69 if (res->flags & IORESOURCE_IO) { 70 if (res->flags & IORESOURCE_IO) {
70 resource_size_t start = res->start;
71
72 if (skip_isa_ioresource_align(dev)) 71 if (skip_isa_ioresource_align(dev))
73 return; 72 return start;
74 if (start & 0x300) { 73 if (start & 0x300)
75 start = (start + 0x3ff) & ~0x3ff; 74 start = (start + 0x3ff) & ~0x3ff;
76 res->start = start;
77 }
78 } 75 }
76 return start;
79} 77}
80EXPORT_SYMBOL(pcibios_align_resource); 78EXPORT_SYMBOL(pcibios_align_resource);
81 79
@@ -257,10 +255,6 @@ void __init pcibios_resource_survey(void)
257 */ 255 */
258fs_initcall(pcibios_assign_resources); 256fs_initcall(pcibios_assign_resources);
259 257
260void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b)
261{
262}
263
264/* 258/*
265 * If we set up a device for bus mastering, we need to check the latency 259 * If we set up a device for bus mastering, we need to check the latency
266 * timer as certain crappy BIOSes forget to set it properly. 260 * timer as certain crappy BIOSes forget to set it properly.
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index 25a1f8efed4a..adb62aaa7ecd 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -1,6 +1,7 @@
1#include <linux/pci.h> 1#include <linux/pci.h>
2#include <linux/init.h> 2#include <linux/init.h>
3#include <asm/pci_x86.h> 3#include <asm/pci_x86.h>
4#include <asm/x86_init.h>
4 5
5/* arch_initcall has too random ordering, so call the initializers 6/* arch_initcall has too random ordering, so call the initializers
6 in the right sequence from here. */ 7 in the right sequence from here. */
@@ -15,10 +16,9 @@ static __init int pci_arch_init(void)
15 if (!(pci_probe & PCI_PROBE_NOEARLY)) 16 if (!(pci_probe & PCI_PROBE_NOEARLY))
16 pci_mmcfg_early_init(); 17 pci_mmcfg_early_init();
17 18
18#ifdef CONFIG_PCI_OLPC 19 if (x86_init.pci.arch_init && !x86_init.pci.arch_init())
19 if (!pci_olpc_init()) 20 return 0;
20 return 0; /* skip additional checks if it's an XO */ 21
21#endif
22#ifdef CONFIG_PCI_BIOS 22#ifdef CONFIG_PCI_BIOS
23 pci_pcbios_init(); 23 pci_pcbios_init();
24#endif 24#endif
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 0696d506c4ad..8b107521d24e 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -53,7 +53,7 @@ struct irq_router_handler {
53 int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); 53 int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
54}; 54};
55 55
56int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; 56int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq;
57void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; 57void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
58 58
59/* 59/*
@@ -590,6 +590,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
590 case PCI_DEVICE_ID_INTEL_ICH10_1: 590 case PCI_DEVICE_ID_INTEL_ICH10_1:
591 case PCI_DEVICE_ID_INTEL_ICH10_2: 591 case PCI_DEVICE_ID_INTEL_ICH10_2:
592 case PCI_DEVICE_ID_INTEL_ICH10_3: 592 case PCI_DEVICE_ID_INTEL_ICH10_3:
593 case PCI_DEVICE_ID_INTEL_CPT_LPC1:
594 case PCI_DEVICE_ID_INTEL_CPT_LPC2:
593 r->name = "PIIX/ICH"; 595 r->name = "PIIX/ICH";
594 r->get = pirq_piix_get; 596 r->get = pirq_piix_get;
595 r->set = pirq_piix_set; 597 r->set = pirq_piix_set;
@@ -1016,7 +1018,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
1016 return 1; 1018 return 1;
1017} 1019}
1018 1020
1019static void __init pcibios_fixup_irqs(void) 1021void __init pcibios_fixup_irqs(void)
1020{ 1022{
1021 struct pci_dev *dev = NULL; 1023 struct pci_dev *dev = NULL;
1022 u8 pin; 1024 u8 pin;
@@ -1110,12 +1112,12 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {
1110 { } 1112 { }
1111}; 1113};
1112 1114
1113int __init pcibios_irq_init(void) 1115void __init pcibios_irq_init(void)
1114{ 1116{
1115 DBG(KERN_DEBUG "PCI: IRQ init\n"); 1117 DBG(KERN_DEBUG "PCI: IRQ init\n");
1116 1118
1117 if (pcibios_enable_irq || raw_pci_ops == NULL) 1119 if (raw_pci_ops == NULL)
1118 return 0; 1120 return;
1119 1121
1120 dmi_check_system(pciirq_dmi_table); 1122 dmi_check_system(pciirq_dmi_table);
1121 1123
@@ -1142,9 +1144,7 @@ int __init pcibios_irq_init(void)
1142 pirq_table = NULL; 1144 pirq_table = NULL;
1143 } 1145 }
1144 1146
1145 pcibios_enable_irq = pirq_enable_irq; 1147 x86_init.pci.fixup_irqs();
1146
1147 pcibios_fixup_irqs();
1148 1148
1149 if (io_apic_assign_pci_irqs && pci_routeirq) { 1149 if (io_apic_assign_pci_irqs && pci_routeirq) {
1150 struct pci_dev *dev = NULL; 1150 struct pci_dev *dev = NULL;
@@ -1157,8 +1157,6 @@ int __init pcibios_irq_init(void)
1157 for_each_pci_dev(dev) 1157 for_each_pci_dev(dev)
1158 pirq_enable_irq(dev); 1158 pirq_enable_irq(dev);
1159 } 1159 }
1160
1161 return 0;
1162} 1160}
1163 1161
1164static void pirq_penalize_isa_irq(int irq, int active) 1162static void pirq_penalize_isa_irq(int irq, int active)
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 4061bb0f267d..0db5eaf54560 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -35,16 +35,13 @@ static void __devinit pcibios_fixup_peer_bridges(void)
35 } 35 }
36} 36}
37 37
38static int __init pci_legacy_init(void) 38int __init pci_legacy_init(void)
39{ 39{
40 if (!raw_pci_ops) { 40 if (!raw_pci_ops) {
41 printk("PCI: System does not support PCI\n"); 41 printk("PCI: System does not support PCI\n");
42 return 0; 42 return 0;
43 } 43 }
44 44
45 if (pcibios_scanned++)
46 return 0;
47
48 printk("PCI: Probing PCI hardware\n"); 45 printk("PCI: Probing PCI hardware\n");
49 pci_root_bus = pcibios_scan_root(0); 46 pci_root_bus = pcibios_scan_root(0);
50 if (pci_root_bus) 47 if (pci_root_bus)
@@ -55,18 +52,15 @@ static int __init pci_legacy_init(void)
55 52
56int __init pci_subsys_init(void) 53int __init pci_subsys_init(void)
57{ 54{
58#ifdef CONFIG_X86_NUMAQ 55 /*
59 pci_numaq_init(); 56 * The init function returns an non zero value when
60#endif 57 * pci_legacy_init should be invoked.
61#ifdef CONFIG_ACPI 58 */
62 pci_acpi_init(); 59 if (x86_init.pci.init())
63#endif 60 pci_legacy_init();
64#ifdef CONFIG_X86_VISWS 61
65 pci_visws_init();
66#endif
67 pci_legacy_init();
68 pcibios_fixup_peer_bridges(); 62 pcibios_fixup_peer_bridges();
69 pcibios_irq_init(); 63 x86_init.pci.init_irq();
70 pcibios_init(); 64 pcibios_init();
71 65
72 return 0; 66 return 0;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index b19d1e54201e..8f3f9a50b1e0 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -303,22 +303,17 @@ static void __init pci_mmcfg_check_end_bus_number(void)
303{ 303{
304 struct pci_mmcfg_region *cfg, *cfgx; 304 struct pci_mmcfg_region *cfg, *cfgx;
305 305
306 /* last one*/ 306 /* Fixup overlaps */
307 cfg = list_entry(pci_mmcfg_list.prev, typeof(*cfg), list);
308 if (cfg)
309 if (cfg->end_bus < cfg->start_bus)
310 cfg->end_bus = 255;
311
312 if (list_is_singular(&pci_mmcfg_list))
313 return;
314
315 /* don't overlap please */
316 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 307 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
317 if (cfg->end_bus < cfg->start_bus) 308 if (cfg->end_bus < cfg->start_bus)
318 cfg->end_bus = 255; 309 cfg->end_bus = 255;
319 310
311 /* Don't access the list head ! */
312 if (cfg->list.next == &pci_mmcfg_list)
313 break;
314
320 cfgx = list_entry(cfg->list.next, typeof(*cfg), list); 315 cfgx = list_entry(cfg->list.next, typeof(*cfg), list);
321 if (cfg != cfgx && cfg->end_bus >= cfgx->start_bus) 316 if (cfg->end_bus >= cfgx->start_bus)
322 cfg->end_bus = cfgx->start_bus - 1; 317 cfg->end_bus = cfgx->start_bus - 1;
323 } 318 }
324} 319}
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
new file mode 100644
index 000000000000..8bf2fcb88d04
--- /dev/null
+++ b/arch/x86/pci/mrst.c
@@ -0,0 +1,262 @@
1/*
2 * Moorestown PCI support
3 * Copyright (c) 2008 Intel Corporation
4 * Jesse Barnes <jesse.barnes@intel.com>
5 *
6 * Moorestown has an interesting PCI implementation:
7 * - configuration space is memory mapped (as defined by MCFG)
8 * - Lincroft devices also have a real, type 1 configuration space
9 * - Early Lincroft silicon has a type 1 access bug that will cause
10 * a hang if non-existent devices are accessed
11 * - some devices have the "fixed BAR" capability, which means
12 * they can't be relocated or modified; check for that during
13 * BAR sizing
14 *
15 * So, we use the MCFG space for all reads and writes, but also send
16 * Lincroft writes to type 1 space. But only read/write if the device
17 * actually exists, otherwise return all 1s for reads and bit bucket
18 * the writes.
19 */
20
21#include <linux/sched.h>
22#include <linux/pci.h>
23#include <linux/ioport.h>
24#include <linux/init.h>
25#include <linux/dmi.h>
26
27#include <asm/acpi.h>
28#include <asm/segment.h>
29#include <asm/io.h>
30#include <asm/smp.h>
31#include <asm/pci_x86.h>
32#include <asm/hw_irq.h>
33#include <asm/io_apic.h>
34
35#define PCIE_CAP_OFFSET 0x100
36
37/* Fixed BAR fields */
38#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */
39#define PCI_FIXED_BAR_0_SIZE 0x04
40#define PCI_FIXED_BAR_1_SIZE 0x08
41#define PCI_FIXED_BAR_2_SIZE 0x0c
42#define PCI_FIXED_BAR_3_SIZE 0x10
43#define PCI_FIXED_BAR_4_SIZE 0x14
44#define PCI_FIXED_BAR_5_SIZE 0x1c
45
46/**
47 * fixed_bar_cap - return the offset of the fixed BAR cap if found
48 * @bus: PCI bus
49 * @devfn: device in question
50 *
51 * Look for the fixed BAR cap on @bus and @devfn, returning its offset
52 * if found or 0 otherwise.
53 */
54static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn)
55{
56 int pos;
57 u32 pcie_cap = 0, cap_data;
58
59 pos = PCIE_CAP_OFFSET;
60
61 if (!raw_pci_ext_ops)
62 return 0;
63
64 while (pos) {
65 if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
66 devfn, pos, 4, &pcie_cap))
67 return 0;
68
69 if (pcie_cap == 0xffffffff)
70 return 0;
71
72 if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) {
73 raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
74 devfn, pos + 4, 4, &cap_data);
75 if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR)
76 return pos;
77 }
78
79 pos = pcie_cap >> 20;
80 }
81
82 return 0;
83}
84
85static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
86 int reg, int len, u32 val, int offset)
87{
88 u32 size;
89 unsigned int domain, busnum;
90 int bar = (reg - PCI_BASE_ADDRESS_0) >> 2;
91
92 domain = pci_domain_nr(bus);
93 busnum = bus->number;
94
95 if (val == ~0 && len == 4) {
96 unsigned long decode;
97
98 raw_pci_ext_ops->read(domain, busnum, devfn,
99 offset + 8 + (bar * 4), 4, &size);
100
101 /* Turn the size into a decode pattern for the sizing code */
102 if (size) {
103 decode = size - 1;
104 decode |= decode >> 1;
105 decode |= decode >> 2;
106 decode |= decode >> 4;
107 decode |= decode >> 8;
108 decode |= decode >> 16;
109 decode++;
110 decode = ~(decode - 1);
111 } else {
112 decode = ~0;
113 }
114
115 /*
116 * If val is all ones, the core code is trying to size the reg,
117 * so update the mmconfig space with the real size.
118 *
119 * Note: this assumes the fixed size we got is a power of two.
120 */
121 return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4,
122 decode);
123 }
124
125 /* This is some other kind of BAR write, so just do it. */
126 return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val);
127}
128
129/**
130 * type1_access_ok - check whether to use type 1
131 * @bus: bus number
132 * @devfn: device & function in question
133 *
134 * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at
135 * all, the we can go ahead with any reads & writes. If it's on a Lincroft,
136 * but doesn't exist, avoid the access altogether to keep the chip from
137 * hanging.
138 */
139static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
140{
141 /* This is a workaround for A0 LNC bug where PCI status register does
142 * not have new CAP bit set. can not be written by SW either.
143 *
144 * PCI header type in real LNC indicates a single function device, this
145 * will prevent probing other devices under the same function in PCI
146 * shim. Therefore, use the header type in shim instead.
147 */
148 if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
149 return 0;
150 if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0)))
151 return 1;
152 return 0; /* langwell on others */
153}
154
155static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
156 int size, u32 *value)
157{
158 if (type1_access_ok(bus->number, devfn, where))
159 return pci_direct_conf1.read(pci_domain_nr(bus), bus->number,
160 devfn, where, size, value);
161 return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
162 devfn, where, size, value);
163}
164
165static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
166 int size, u32 value)
167{
168 int offset;
169
170 /* On MRST, there is no PCI ROM BAR, this will cause a subsequent read
171 * to ROM BAR return 0 then being ignored.
172 */
173 if (where == PCI_ROM_ADDRESS)
174 return 0;
175
176 /*
177 * Devices with fixed BARs need special handling:
178 * - BAR sizing code will save, write ~0, read size, restore
179 * - so writes to fixed BARs need special handling
180 * - other writes to fixed BAR devices should go through mmconfig
181 */
182 offset = fixed_bar_cap(bus, devfn);
183 if (offset &&
184 (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) {
185 return pci_device_update_fixed(bus, devfn, where, size, value,
186 offset);
187 }
188
189 /*
190 * On Moorestown update both real & mmconfig space
191 * Note: early Lincroft silicon can't handle type 1 accesses to
192 * non-existent devices, so just eat the write in that case.
193 */
194 if (type1_access_ok(bus->number, devfn, where))
195 return pci_direct_conf1.write(pci_domain_nr(bus), bus->number,
196 devfn, where, size, value);
197 return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn,
198 where, size, value);
199}
200
201static int mrst_pci_irq_enable(struct pci_dev *dev)
202{
203 u8 pin;
204 struct io_apic_irq_attr irq_attr;
205
206 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
207
208 /* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
209 * IOAPIC RTE entries, so we just enable RTE for the device.
210 */
211 irq_attr.ioapic = mp_find_ioapic(dev->irq);
212 irq_attr.ioapic_pin = dev->irq;
213 irq_attr.trigger = 1; /* level */
214 irq_attr.polarity = 1; /* active low */
215 io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr);
216
217 return 0;
218}
219
220struct pci_ops pci_mrst_ops = {
221 .read = pci_read,
222 .write = pci_write,
223};
224
225/**
226 * pci_mrst_init - installs pci_mrst_ops
227 *
228 * Moorestown has an interesting PCI implementation (see above).
229 * Called when the early platform detection installs it.
230 */
231int __init pci_mrst_init(void)
232{
233 printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n");
234 pci_mmcfg_late_init();
235 pcibios_enable_irq = mrst_pci_irq_enable;
236 pci_root_ops = pci_mrst_ops;
237 /* Continue with standard init */
238 return 1;
239}
240
241/*
242 * Langwell devices reside at fixed offsets, don't try to move them.
243 */
244static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
245{
246 unsigned long offset;
247 u32 size;
248 int i;
249
250 /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */
251 offset = fixed_bar_cap(dev->bus, dev->devfn);
252 if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||
253 PCI_DEVFN(2, 2) == dev->devfn)
254 return;
255
256 for (i = 0; i < PCI_ROM_RESOURCE; i++) {
257 pci_read_config_dword(dev, offset + 8 + (i * 4), &size);
258 dev->resource[i].end = dev->resource[i].start + size - 1;
259 dev->resource[i].flags |= IORESOURCE_PCI_FIXED;
260 }
261}
262DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup);
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 8eb295e116f6..8223738ad806 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -8,9 +8,7 @@
8#include <asm/apic.h> 8#include <asm/apic.h>
9#include <asm/mpspec.h> 9#include <asm/mpspec.h>
10#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
11 11#include <asm/numaq.h>
12#define XQUAD_PORTIO_BASE 0xfe400000
13#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
14 12
15#define BUS2QUAD(global) (mp_bus_id_to_node[global]) 13#define BUS2QUAD(global) (mp_bus_id_to_node[global])
16 14
@@ -18,8 +16,6 @@
18 16
19#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) 17#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
20 18
21#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
22
23#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ 19#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
24 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) 20 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
25 21
@@ -152,14 +148,8 @@ int __init pci_numaq_init(void)
152{ 148{
153 int quad; 149 int quad;
154 150
155 if (!found_numaq)
156 return 0;
157
158 raw_pci_ops = &pci_direct_conf1_mq; 151 raw_pci_ops = &pci_direct_conf1_mq;
159 152
160 if (pcibios_scanned++)
161 return 0;
162
163 pci_root_bus = pcibios_scan_root(0); 153 pci_root_bus = pcibios_scan_root(0);
164 if (pci_root_bus) 154 if (pci_root_bus)
165 pci_bus_add_devices(pci_root_bus); 155 pci_bus_add_devices(pci_root_bus);
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index b889d824f7c6..b34815408f58 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -304,9 +304,6 @@ static struct pci_raw_ops pci_olpc_conf = {
304 304
305int __init pci_olpc_init(void) 305int __init pci_olpc_init(void)
306{ 306{
307 if (!machine_is_olpc() || olpc_has_vsa())
308 return -ENODEV;
309
310 printk(KERN_INFO "PCI: Using configuration type OLPC\n"); 307 printk(KERN_INFO "PCI: Using configuration type OLPC\n");
311 raw_pci_ops = &pci_olpc_conf; 308 raw_pci_ops = &pci_olpc_conf;
312 is_lx = is_geode_lx(); 309 is_lx = is_geode_lx();
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index bcead7a46871..03008f72eb04 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -69,9 +69,6 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
69 69
70int __init pci_visws_init(void) 70int __init pci_visws_init(void)
71{ 71{
72 if (!is_visws_box())
73 return -1;
74
75 pcibios_enable_irq = &pci_visws_enable_irq; 72 pcibios_enable_irq = &pci_visws_enable_irq;
76 pcibios_disable_irq = &pci_visws_disable_irq; 73 pcibios_disable_irq = &pci_visws_disable_irq;
77 74
@@ -90,5 +87,6 @@ int __init pci_visws_init(void)
90 pci_scan_bus_with_sysdata(pci_bus1); 87 pci_scan_bus_with_sysdata(pci_bus1);
91 pci_fixup_irqs(pci_common_swizzle, visws_map_irq); 88 pci_fixup_irqs(pci_common_swizzle, visws_map_irq);
92 pcibios_resource_survey(); 89 pcibios_resource_survey();
93 return 0; 90 /* Request bus scan */
91 return 1;
94} 92}
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
index bee8d6ac2691..13403fc95a96 100644
--- a/arch/x86/tools/test_get_len.c
+++ b/arch/x86/tools/test_get_len.c
@@ -43,7 +43,7 @@ static int x86_64;
43static void usage(void) 43static void usage(void)
44{ 44{
45 fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" 45 fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |"
46 " %s [-y|-n] [-v] \n", prog); 46 " %s [-y|-n] [-v]\n", prog);
47 fprintf(stderr, "\t-y 64bit mode\n"); 47 fprintf(stderr, "\t-y 64bit mode\n");
48 fprintf(stderr, "\t-n 32bit mode\n"); 48 fprintf(stderr, "\t-n 32bit mode\n");
49 fprintf(stderr, "\t-v verbose mode\n"); 49 fprintf(stderr, "\t-v verbose mode\n");
@@ -69,7 +69,7 @@ static void dump_field(FILE *fp, const char *name, const char *indent,
69 69
70static void dump_insn(FILE *fp, struct insn *insn) 70static void dump_insn(FILE *fp, struct insn *insn)
71{ 71{
72 fprintf(fp, "Instruction = { \n"); 72 fprintf(fp, "Instruction = {\n");
73 dump_field(fp, "prefixes", "\t", &insn->prefixes); 73 dump_field(fp, "prefixes", "\t", &insn->prefixes);
74 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); 74 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
75 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); 75 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 36daccb68642..b607239c1ba8 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -50,6 +50,7 @@
50#include <asm/traps.h> 50#include <asm/traps.h>
51#include <asm/setup.h> 51#include <asm/setup.h>
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/pgalloc.h>
53#include <asm/pgtable.h> 54#include <asm/pgtable.h>
54#include <asm/tlbflush.h> 55#include <asm/tlbflush.h>
55#include <asm/reboot.h> 56#include <asm/reboot.h>
@@ -1094,6 +1095,12 @@ asmlinkage void __init xen_start_kernel(void)
1094 1095
1095 __supported_pte_mask |= _PAGE_IOMAP; 1096 __supported_pte_mask |= _PAGE_IOMAP;
1096 1097
1098 /*
1099 * Prevent page tables from being allocated in highmem, even
1100 * if CONFIG_HIGHPTE is enabled.
1101 */
1102 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
1103
1097 /* Work out if we support NX */ 1104 /* Work out if we support NX */
1098 x86_configure_nx(); 1105 x86_configure_nx();
1099 1106
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index bf4cd6bfe959..f9eb7de74f42 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1427,23 +1427,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1427#endif 1427#endif
1428} 1428}
1429 1429
1430#ifdef CONFIG_HIGHPTE
1431static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
1432{
1433 pgprot_t prot = PAGE_KERNEL;
1434
1435 if (PagePinned(page))
1436 prot = PAGE_KERNEL_RO;
1437
1438 if (0 && PageHighMem(page))
1439 printk("mapping highpte %lx type %d prot %s\n",
1440 page_to_pfn(page), type,
1441 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
1442
1443 return kmap_atomic_prot(page, type, prot);
1444}
1445#endif
1446
1447#ifdef CONFIG_X86_32 1430#ifdef CONFIG_X86_32
1448static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 1431static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1449{ 1432{
@@ -1902,10 +1885,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1902 .alloc_pmd_clone = paravirt_nop, 1885 .alloc_pmd_clone = paravirt_nop,
1903 .release_pmd = xen_release_pmd_init, 1886 .release_pmd = xen_release_pmd_init,
1904 1887
1905#ifdef CONFIG_HIGHPTE
1906 .kmap_atomic_pte = xen_kmap_atomic_pte,
1907#endif
1908
1909#ifdef CONFIG_X86_64 1888#ifdef CONFIG_X86_64
1910 .set_pte = xen_set_pte, 1889 .set_pte = xen_set_pte,
1911#else 1890#else
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 563d20504988..deafb65ef44e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -361,7 +361,7 @@ static void xen_cpu_die(unsigned int cpu)
361 alternatives_smp_switch(0); 361 alternatives_smp_switch(0);
362} 362}
363 363
364static void __cpuinit xen_play_dead(void) /* used only with CPU_HOTPLUG */ 364static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */
365{ 365{
366 play_dead_common(); 366 play_dead_common();
367 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 367 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 88e15deb8b82..22a2093b5862 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -90,9 +90,9 @@ ENTRY(xen_iret)
90 GET_THREAD_INFO(%eax) 90 GET_THREAD_INFO(%eax)
91 movl TI_cpu(%eax), %eax 91 movl TI_cpu(%eax), %eax
92 movl __per_cpu_offset(,%eax,4), %eax 92 movl __per_cpu_offset(,%eax,4), %eax
93 mov per_cpu__xen_vcpu(%eax), %eax 93 mov xen_vcpu(%eax), %eax
94#else 94#else
95 movl per_cpu__xen_vcpu, %eax 95 movl xen_vcpu, %eax
96#endif 96#endif
97 97
98 /* check IF state we're restoring */ 98 /* check IF state we're restoring */