Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 17
-rw-r--r--  arch/x86/Kconfig.cpu | 2
-rw-r--r--  arch/x86/Makefile | 2
-rw-r--r--  arch/x86/boot/compressed/misc.c | 15
-rw-r--r--  arch/x86/boot/mkcpustr.c | 2
-rw-r--r--  arch/x86/boot/video-vga.c | 9
-rw-r--r--  arch/x86/boot/video.c | 7
-rw-r--r--  arch/x86/ia32/ia32_aout.c | 2
-rw-r--r--  arch/x86/include/asm/Kbuild | 1
-rw-r--r--  arch/x86/include/asm/alternative.h | 16
-rw-r--r--  arch/x86/include/asm/atomic.h | 299
-rw-r--r--  arch/x86/include/asm/atomic64_32.h | 160
-rw-r--r--  arch/x86/include/asm/atomic64_64.h | 224
-rw-r--r--  arch/x86/include/asm/atomic_32.h | 415
-rw-r--r--  arch/x86/include/asm/atomic_64.h | 485
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 4
-rw-r--r--  arch/x86/include/asm/debugreg.h | 3
-rw-r--r--  arch/x86/include/asm/e820.h | 5
-rw-r--r--  arch/x86/include/asm/elf.h | 5
-rw-r--r--  arch/x86/include/asm/fb.h | 4
-rw-r--r--  arch/x86/include/asm/fixmap.h | 16
-rw-r--r--  arch/x86/include/asm/highmem.h | 4
-rw-r--r--  arch/x86/include/asm/hyperv.h | 186
-rw-r--r--  arch/x86/include/asm/i387.h | 12
-rw-r--r--  arch/x86/include/asm/io.h | 155
-rw-r--r--  arch/x86/include/asm/io_32.h | 196
-rw-r--r--  arch/x86/include/asm/io_64.h | 181
-rw-r--r--  arch/x86/include/asm/kprobes.h | 31
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 17
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 60
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 1
-rw-r--r--  arch/x86/include/asm/local.h | 37
-rw-r--r--  arch/x86/include/asm/mmzone_64.h | 6
-rw-r--r--  arch/x86/include/asm/nmi.h | 1
-rw-r--r--  arch/x86/include/asm/numa_64.h | 5
-rw-r--r--  arch/x86/include/asm/numaq.h | 4
-rw-r--r--  arch/x86/include/asm/page_types.h | 1
-rw-r--r--  arch/x86/include/asm/paravirt.h | 9
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 4
-rw-r--r--  arch/x86/include/asm/pci.h | 2
-rw-r--r--  arch/x86/include/asm/pci_64.h | 2
-rw-r--r--  arch/x86/include/asm/pci_x86.h | 1
-rw-r--r--  arch/x86/include/asm/percpu.h | 119
-rw-r--r--  arch/x86/include/asm/perf_event.h | 16
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable_32.h | 6
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 2
-rw-r--r--  arch/x86/include/asm/processor.h | 2
-rw-r--r--  arch/x86/include/asm/proto.h | 10
-rw-r--r--  arch/x86/include/asm/ptrace.h | 4
-rw-r--r--  arch/x86/include/asm/rwsem.h | 81
-rw-r--r--  arch/x86/include/asm/smp.h | 9
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 2
-rw-r--r--  arch/x86/include/asm/svm.h | 2
-rw-r--r--  arch/x86/include/asm/syscall.h | 2
-rw-r--r--  arch/x86/include/asm/system.h | 8
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 21
-rw-r--r--  arch/x86/include/asm/user.h | 58
-rw-r--r--  arch/x86/include/asm/uv/bios.h | 11
-rw-r--r--  arch/x86/include/asm/uv/uv.h | 1
-rw-r--r--  arch/x86/include/asm/uv/uv_hub.h | 3
-rw-r--r--  arch/x86/include/asm/vmx.h | 5
-rw-r--r--  arch/x86/include/asm/x86_init.h | 2
-rw-r--r--  arch/x86/include/asm/xsave.h | 2
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 29
-rw-r--r--  arch/x86/kernel/alternative.c | 82
-rw-r--r--  arch/x86/kernel/apb_timer.c | 4
-rw-r--r--  arch/x86/kernel/apic/apic.c | 2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 52
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 6
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 89
-rw-r--r--  arch/x86/kernel/apm_32.c | 4
-rw-r--r--  arch/x86/kernel/bios_uv.c | 39
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Kconfig | 14
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 620
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 6
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 334
-rw-r--r--  arch/x86/kernel/cpu/mtrr/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/centaur.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 208
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cyrix.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 10
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 7
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/state.c | 94
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 1854
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 416
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 971
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 157
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 11
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 5
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 6
-rw-r--r--  arch/x86/kernel/e820.c | 357
-rw-r--r--  arch/x86/kernel/efi.c | 2
-rw-r--r--  arch/x86/kernel/ftrace.c | 36
-rw-r--r--  arch/x86/kernel/head32.c | 10
-rw-r--r--  arch/x86/kernel/head_32.S | 6
-rw-r--r--  arch/x86/kernel/hpet.c | 2
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 40
-rw-r--r--  arch/x86/kernel/i387.c | 71
-rw-r--r--  arch/x86/kernel/kprobes.c | 614
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 2
-rw-r--r--  arch/x86/kernel/mmconf-fam10h_64.c | 7
-rw-r--r--  arch/x86/kernel/paravirt.c | 4
-rw-r--r--  arch/x86/kernel/pci-dma.c | 13
-rw-r--r--  arch/x86/kernel/process.c | 7
-rw-r--r--  arch/x86/kernel/process_32.c | 6
-rw-r--r--  arch/x86/kernel/process_64.c | 7
-rw-r--r--  arch/x86/kernel/ptrace.c | 65
-rw-r--r--  arch/x86/kernel/setup.c | 31
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 6
-rw-r--r--  arch/x86/kernel/smpboot.c | 1
-rw-r--r--  arch/x86/kernel/traps.c | 3
-rw-r--r--  arch/x86/kernel/tsc.c | 4
-rw-r--r--  arch/x86/kernel/uv_sysfs.c | 6
-rw-r--r--  arch/x86/kernel/vmi_32.c | 35
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 4
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 3
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 3
-rw-r--r--  arch/x86/kernel/x86_init.c | 3
-rw-r--r--  arch/x86/kernel/xsave.c | 1
-rw-r--r--  arch/x86/kvm/Kconfig | 2
-rw-r--r--  arch/x86/kvm/emulate.c | 440
-rw-r--r--  arch/x86/kvm/i8254.c | 23
-rw-r--r--  arch/x86/kvm/i8254.h | 2
-rw-r--r--  arch/x86/kvm/i8259.c | 46
-rw-r--r--  arch/x86/kvm/irq.h | 3
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 31
-rw-r--r--  arch/x86/kvm/lapic.c | 31
-rw-r--r--  arch/x86/kvm/lapic.h | 8
-rw-r--r--  arch/x86/kvm/mmu.c | 137
-rw-r--r--  arch/x86/kvm/mmu.h | 35
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 13
-rw-r--r--  arch/x86/kvm/svm.c | 237
-rw-r--r--  arch/x86/kvm/trace.h | 59
-rw-r--r--  arch/x86/kvm/vmx.c | 396
-rw-r--r--  arch/x86/kvm/x86.c | 1098
-rw-r--r--  arch/x86/kvm/x86.h | 30
-rw-r--r--  arch/x86/lib/Makefile | 5
-rw-r--r--  arch/x86/lib/cache-smp.c | 19
-rw-r--r--  arch/x86/lib/copy_user_64.S | 6
-rw-r--r--  arch/x86/lib/io_64.c | 25
-rw-r--r--  arch/x86/lib/memcpy_64.S | 23
-rw-r--r--  arch/x86/lib/memset_64.S | 18
-rw-r--r--  arch/x86/lib/rwsem_64.S | 81
-rw-r--r--  arch/x86/mm/init.c | 7
-rw-r--r--  arch/x86/mm/init_32.c | 15
-rw-r--r--  arch/x86/mm/init_64.c | 9
-rw-r--r--  arch/x86/mm/ioremap.c | 41
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c | 2
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.c | 16
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.h | 2
-rw-r--r--  arch/x86/mm/mmap.c | 4
-rw-r--r--  arch/x86/mm/numa_32.c | 3
-rw-r--r--  arch/x86/mm/numa_64.c | 332
-rw-r--r--  arch/x86/mm/pgtable.c | 31
-rw-r--r--  arch/x86/mm/tlb.c | 8
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 17
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 244
-rw-r--r--  arch/x86/oprofile/op_model_p4.c | 6
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 17
-rw-r--r--  arch/x86/oprofile/op_x86_model.h | 20
-rw-r--r--  arch/x86/pci/Makefile | 3
-rw-r--r--  arch/x86/pci/acpi.c | 82
-rw-r--r--  arch/x86/pci/amd_bus.c | 127
-rw-r--r--  arch/x86/pci/bus_numa.c | 28
-rw-r--r--  arch/x86/pci/bus_numa.h | 12
-rw-r--r--  arch/x86/pci/common.c | 3
-rw-r--r--  arch/x86/pci/i386.c | 18
-rw-r--r--  arch/x86/pci/irq.c | 2
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 17
-rw-r--r--  arch/x86/pci/numaq_32.c | 6
-rw-r--r--  arch/x86/tools/test_get_len.c | 4
-rw-r--r--  arch/x86/xen/enlighten.c | 7
-rw-r--r--  arch/x86/xen/mmu.c | 21
-rw-r--r--  arch/x86/xen/xen-asm_32.S | 4
180 files changed, 7962 insertions, 5283 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d22686f69de8..e98440371525 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -31,6 +31,7 @@ config X86
31 select ARCH_WANT_FRAME_POINTERS 31 select ARCH_WANT_FRAME_POINTERS
32 select HAVE_DMA_ATTRS 32 select HAVE_DMA_ATTRS
33 select HAVE_KRETPROBES 33 select HAVE_KRETPROBES
34 select HAVE_OPTPROBES
34 select HAVE_FTRACE_MCOUNT_RECORD 35 select HAVE_FTRACE_MCOUNT_RECORD
35 select HAVE_DYNAMIC_FTRACE 36 select HAVE_DYNAMIC_FTRACE
36 select HAVE_FUNCTION_TRACER 37 select HAVE_FUNCTION_TRACER
@@ -45,6 +46,7 @@ config X86
45 select HAVE_GENERIC_DMA_COHERENT if X86_32 46 select HAVE_GENERIC_DMA_COHERENT if X86_32
46 select HAVE_EFFICIENT_UNALIGNED_ACCESS 47 select HAVE_EFFICIENT_UNALIGNED_ACCESS
47 select USER_STACKTRACE_SUPPORT 48 select USER_STACKTRACE_SUPPORT
49 select HAVE_REGS_AND_STACK_ACCESS_API
48 select HAVE_DMA_API_DEBUG 50 select HAVE_DMA_API_DEBUG
49 select HAVE_KERNEL_GZIP 51 select HAVE_KERNEL_GZIP
50 select HAVE_KERNEL_BZIP2 52 select HAVE_KERNEL_BZIP2
@@ -183,6 +185,9 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
183config ARCH_SUPPORTS_DEBUG_PAGEALLOC 185config ARCH_SUPPORTS_DEBUG_PAGEALLOC
184 def_bool y 186 def_bool y
185 187
188config HAVE_EARLY_RES
189 def_bool y
190
186config HAVE_INTEL_TXT 191config HAVE_INTEL_TXT
187 def_bool y 192 def_bool y
188 depends on EXPERIMENTAL && DMAR && ACPI 193 depends on EXPERIMENTAL && DMAR && ACPI
@@ -573,6 +578,18 @@ config PARAVIRT_DEBUG
573 Enable to debug paravirt_ops internals. Specifically, BUG if 578 Enable to debug paravirt_ops internals. Specifically, BUG if
574 a paravirt_op is missing when it is called. 579 a paravirt_op is missing when it is called.
575 580
581config NO_BOOTMEM
582 default y
583 bool "Disable Bootmem code"
584 ---help---
585 Use early_res directly instead of bootmem before slab is ready.
586 - allocator (buddy) [generic]
587 - early allocator (bootmem) [generic]
588 - very early allocator (reserve_early*()) [x86]
589 - very very early allocator (early brk model) [x86]
590 So reduce one layer between early allocator to final allocator
591
592
576config MEMTEST 593config MEMTEST
577 bool "Memtest" 594 bool "Memtest"
578 ---help--- 595 ---help---
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index f20ddf84a893..a19829374e6a 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -319,7 +319,7 @@ config X86_L1_CACHE_SHIFT
319 319
320config X86_XADD 320config X86_XADD
321 def_bool y 321 def_bool y
322 depends on X86_32 && !M386 322 depends on X86_64 || !M386
323 323
324config X86_PPRO_FENCE 324config X86_PPRO_FENCE
325 bool "PentiumPro memory ordering errata workaround" 325 bool "PentiumPro memory ordering errata workaround"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 78b32be55e9e..0a43dc515e4c 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -135,9 +135,7 @@ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
135# suspend and hibernation support 135# suspend and hibernation support
136drivers-$(CONFIG_PM) += arch/x86/power/ 136drivers-$(CONFIG_PM) += arch/x86/power/
137 137
138ifeq ($(CONFIG_X86_32),y)
139drivers-$(CONFIG_FB) += arch/x86/video/ 138drivers-$(CONFIG_FB) += arch/x86/video/
140endif
141 139
142#### 140####
143# boot loader support. Several targets are kept for legacy purposes 141# boot loader support. Several targets are kept for legacy purposes
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 3b22fe8ab91b..51e240779a44 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -19,11 +19,6 @@
19#define _ASM_X86_DESC_H 1 19#define _ASM_X86_DESC_H 1
20#endif 20#endif
21 21
22#ifdef CONFIG_X86_64
23#define _LINUX_STRING_H_ 1
24#define __LINUX_BITMAP_H 1
25#endif
26
27#include <linux/linkage.h> 22#include <linux/linkage.h>
28#include <linux/screen_info.h> 23#include <linux/screen_info.h>
29#include <linux/elf.h> 24#include <linux/elf.h>
@@ -131,8 +126,8 @@ static void error(char *m);
131static struct boot_params *real_mode; /* Pointer to real-mode data */ 126static struct boot_params *real_mode; /* Pointer to real-mode data */
132static int quiet; 127static int quiet;
133 128
134static void *memset(void *s, int c, unsigned n); 129void *memset(void *s, int c, size_t n);
135void *memcpy(void *dest, const void *src, unsigned n); 130void *memcpy(void *dest, const void *src, size_t n);
136 131
137static void __putstr(int, const char *); 132static void __putstr(int, const char *);
138#define putstr(__x) __putstr(0, __x) 133#define putstr(__x) __putstr(0, __x)
@@ -185,11 +180,9 @@ static void __putstr(int error, const char *s)
185 return; 180 return;
186#endif 181#endif
187 182
188#ifdef CONFIG_X86_32
189 if (real_mode->screen_info.orig_video_mode == 0 && 183 if (real_mode->screen_info.orig_video_mode == 0 &&
190 lines == 0 && cols == 0) 184 lines == 0 && cols == 0)
191 return; 185 return;
192#endif
193 186
194 x = real_mode->screen_info.orig_x; 187 x = real_mode->screen_info.orig_x;
195 y = real_mode->screen_info.orig_y; 188 y = real_mode->screen_info.orig_y;
@@ -223,7 +216,7 @@ static void __putstr(int error, const char *s)
223 outb(0xff & (pos >> 1), vidport+1); 216 outb(0xff & (pos >> 1), vidport+1);
224} 217}
225 218
226static void *memset(void *s, int c, unsigned n) 219void *memset(void *s, int c, size_t n)
227{ 220{
228 int i; 221 int i;
229 char *ss = s; 222 char *ss = s;
@@ -233,7 +226,7 @@ static void *memset(void *s, int c, unsigned n)
233 return s; 226 return s;
234} 227}
235 228
236void *memcpy(void *dest, const void *src, unsigned n) 229void *memcpy(void *dest, const void *src, size_t n)
237{ 230{
238 int i; 231 int i;
239 const char *s = src; 232 const char *s = src;
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index 8ef60f20b371..919257f526f2 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -22,7 +22,7 @@ int main(void)
22 int i, j; 22 int i, j;
23 const char *str; 23 const char *str;
24 24
25 printf("static const char x86_cap_strs[] = \n"); 25 printf("static const char x86_cap_strs[] =\n");
26 26
27 for (i = 0; i < NCAPINTS; i++) { 27 for (i = 0; i < NCAPINTS; i++) {
28 for (j = 0; j < 32; j++) { 28 for (j = 0; j < 32; j++) {
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 819caa1f2008..ed7aeff786b2 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -42,22 +42,15 @@ static u8 vga_set_basic_mode(void)
42{ 42{
43 struct biosregs ireg, oreg; 43 struct biosregs ireg, oreg;
44 u16 ax; 44 u16 ax;
45 u8 rows;
46 u8 mode; 45 u8 mode;
47 46
48 initregs(&ireg); 47 initregs(&ireg);
49 48
49 /* Query current mode */
50 ax = 0x0f00; 50 ax = 0x0f00;
51 intcall(0x10, &ireg, &oreg); 51 intcall(0x10, &ireg, &oreg);
52 mode = oreg.al; 52 mode = oreg.al;
53 53
54 set_fs(0);
55 rows = rdfs8(0x484); /* rows minus one */
56
57 if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
58 (rows == 0 || rows == 24))
59 return mode;
60
61 if (mode != 3 && mode != 7) 54 if (mode != 3 && mode != 7)
62 mode = 3; 55 mode = 3;
63 56
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index f767164cd5df..43eda284d27f 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -298,11 +298,18 @@ static void restore_screen(void)
298 } 298 }
299 299
300 /* Restore cursor position */ 300 /* Restore cursor position */
301 if (saved.curx >= xs)
302 saved.curx = xs-1;
303 if (saved.cury >= ys)
304 saved.cury = ys-1;
305
301 initregs(&ireg); 306 initregs(&ireg);
302 ireg.ah = 0x02; /* Set cursor position */ 307 ireg.ah = 0x02; /* Set cursor position */
303 ireg.dh = saved.cury; 308 ireg.dh = saved.cury;
304 ireg.dl = saved.curx; 309 ireg.dl = saved.curx;
305 intcall(0x10, &ireg, NULL); 310 intcall(0x10, &ireg, NULL);
311
312 store_cursor_position();
306} 313}
307 314
308void set_video(void) 315void set_video(void)
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 14531abdd0ce..280c019cfad8 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -297,7 +297,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
297 * size limits imposed on them by creating programs with large 297 * size limits imposed on them by creating programs with large
298 * arrays in the data or bss. 298 * arrays in the data or bss.
299 */ 299 */
300 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 300 rlim = rlimit(RLIMIT_DATA);
301 if (rlim >= RLIM_INFINITY) 301 if (rlim >= RLIM_INFINITY)
302 rlim = ~0; 302 rlim = ~0;
303 if (ex.a_data + ex.a_bss > rlim) 303 if (ex.a_data + ex.a_bss > rlim)
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 9f828f87ca35..493092efaa3b 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -11,6 +11,7 @@ header-y += sigcontext32.h
11header-y += ucontext.h 11header-y += ucontext.h
12header-y += processor-flags.h 12header-y += processor-flags.h
13header-y += hw_breakpoint.h 13header-y += hw_breakpoint.h
14header-y += hyperv.h
14 15
15unifdef-y += e820.h 16unifdef-y += e820.h
16unifdef-y += ist.h 17unifdef-y += ist.h
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 69b74a7b877f..b09ec55650b3 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -65,12 +65,17 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
65 void *text, void *text_end); 65 void *text, void *text_end);
66extern void alternatives_smp_module_del(struct module *mod); 66extern void alternatives_smp_module_del(struct module *mod);
67extern void alternatives_smp_switch(int smp); 67extern void alternatives_smp_switch(int smp);
68extern int alternatives_text_reserved(void *start, void *end);
68#else 69#else
69static inline void alternatives_smp_module_add(struct module *mod, char *name, 70static inline void alternatives_smp_module_add(struct module *mod, char *name,
70 void *locks, void *locks_end, 71 void *locks, void *locks_end,
71 void *text, void *text_end) {} 72 void *text, void *text_end) {}
72static inline void alternatives_smp_module_del(struct module *mod) {} 73static inline void alternatives_smp_module_del(struct module *mod) {}
73static inline void alternatives_smp_switch(int smp) {} 74static inline void alternatives_smp_switch(int smp) {}
75static inline int alternatives_text_reserved(void *start, void *end)
76{
77 return 0;
78}
74#endif /* CONFIG_SMP */ 79#endif /* CONFIG_SMP */
75 80
76/* alternative assembly primitive: */ 81/* alternative assembly primitive: */
@@ -125,11 +130,16 @@ static inline void alternatives_smp_switch(int smp) {}
125 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ 130 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
126 : output : "i" (0), ## input) 131 : output : "i" (0), ## input)
127 132
133/* Like alternative_io, but for replacing a direct call with another one. */
134#define alternative_call(oldfunc, newfunc, feature, output, input...) \
135 asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
136 : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
137
128/* 138/*
129 * use this macro(s) if you need more than one output parameter 139 * use this macro(s) if you need more than one output parameter
130 * in alternative_io 140 * in alternative_io
131 */ 141 */
132#define ASM_OUTPUT2(a, b) a, b 142#define ASM_OUTPUT2(a...) a
133 143
134struct paravirt_patch_site; 144struct paravirt_patch_site;
135#ifdef CONFIG_PARAVIRT 145#ifdef CONFIG_PARAVIRT
@@ -155,10 +165,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
155 * invalid instruction possible) or if the instructions are changed from a 165 * invalid instruction possible) or if the instructions are changed from a
156 * consistent state to another consistent state atomically. 166 * consistent state to another consistent state atomically.
157 * More care must be taken when modifying code in the SMP case because of 167 * More care must be taken when modifying code in the SMP case because of
158 * Intel's errata. 168 * Intel's errata. text_poke_smp() takes care that errata, but still
169 * doesn't support NMI/MCE handler code modifying.
159 * On the local CPU you need to be protected again NMI or MCE handlers seeing an 170 * On the local CPU you need to be protected again NMI or MCE handlers seeing an
160 * inconsistent instruction while you patch. 171 * inconsistent instruction while you patch.
161 */ 172 */
162extern void *text_poke(void *addr, const void *opcode, size_t len); 173extern void *text_poke(void *addr, const void *opcode, size_t len);
174extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
163 175
164#endif /* _ASM_X86_ALTERNATIVE_H */ 176#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 4e1b8873c474..8f8217b9bdac 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -1,5 +1,300 @@
1#ifndef _ASM_X86_ATOMIC_H
2#define _ASM_X86_ATOMIC_H
3
4#include <linux/compiler.h>
5#include <linux/types.h>
6#include <asm/processor.h>
7#include <asm/alternative.h>
8#include <asm/cmpxchg.h>
9
10/*
11 * Atomic operations that C can't guarantee us. Useful for
12 * resource counting etc..
13 */
14
15#define ATOMIC_INIT(i) { (i) }
16
17/**
18 * atomic_read - read atomic variable
19 * @v: pointer of type atomic_t
20 *
21 * Atomically reads the value of @v.
22 */
23static inline int atomic_read(const atomic_t *v)
24{
25 return v->counter;
26}
27
28/**
29 * atomic_set - set atomic variable
30 * @v: pointer of type atomic_t
31 * @i: required value
32 *
33 * Atomically sets the value of @v to @i.
34 */
35static inline void atomic_set(atomic_t *v, int i)
36{
37 v->counter = i;
38}
39
40/**
41 * atomic_add - add integer to atomic variable
42 * @i: integer value to add
43 * @v: pointer of type atomic_t
44 *
45 * Atomically adds @i to @v.
46 */
47static inline void atomic_add(int i, atomic_t *v)
48{
49 asm volatile(LOCK_PREFIX "addl %1,%0"
50 : "+m" (v->counter)
51 : "ir" (i));
52}
53
54/**
55 * atomic_sub - subtract integer from atomic variable
56 * @i: integer value to subtract
57 * @v: pointer of type atomic_t
58 *
59 * Atomically subtracts @i from @v.
60 */
61static inline void atomic_sub(int i, atomic_t *v)
62{
63 asm volatile(LOCK_PREFIX "subl %1,%0"
64 : "+m" (v->counter)
65 : "ir" (i));
66}
67
68/**
69 * atomic_sub_and_test - subtract value from variable and test result
70 * @i: integer value to subtract
71 * @v: pointer of type atomic_t
72 *
73 * Atomically subtracts @i from @v and returns
74 * true if the result is zero, or false for all
75 * other cases.
76 */
77static inline int atomic_sub_and_test(int i, atomic_t *v)
78{
79 unsigned char c;
80
81 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
82 : "+m" (v->counter), "=qm" (c)
83 : "ir" (i) : "memory");
84 return c;
85}
86
87/**
88 * atomic_inc - increment atomic variable
89 * @v: pointer of type atomic_t
90 *
91 * Atomically increments @v by 1.
92 */
93static inline void atomic_inc(atomic_t *v)
94{
95 asm volatile(LOCK_PREFIX "incl %0"
96 : "+m" (v->counter));
97}
98
99/**
100 * atomic_dec - decrement atomic variable
101 * @v: pointer of type atomic_t
102 *
103 * Atomically decrements @v by 1.
104 */
105static inline void atomic_dec(atomic_t *v)
106{
107 asm volatile(LOCK_PREFIX "decl %0"
108 : "+m" (v->counter));
109}
110
111/**
112 * atomic_dec_and_test - decrement and test
113 * @v: pointer of type atomic_t
114 *
115 * Atomically decrements @v by 1 and
116 * returns true if the result is 0, or false for all other
117 * cases.
118 */
119static inline int atomic_dec_and_test(atomic_t *v)
120{
121 unsigned char c;
122
123 asm volatile(LOCK_PREFIX "decl %0; sete %1"
124 : "+m" (v->counter), "=qm" (c)
125 : : "memory");
126 return c != 0;
127}
128
129/**
130 * atomic_inc_and_test - increment and test
131 * @v: pointer of type atomic_t
132 *
133 * Atomically increments @v by 1
134 * and returns true if the result is zero, or false for all
135 * other cases.
136 */
137static inline int atomic_inc_and_test(atomic_t *v)
138{
139 unsigned char c;
140
141 asm volatile(LOCK_PREFIX "incl %0; sete %1"
142 : "+m" (v->counter), "=qm" (c)
143 : : "memory");
144 return c != 0;
145}
146
147/**
148 * atomic_add_negative - add and test if negative
149 * @i: integer value to add
150 * @v: pointer of type atomic_t
151 *
152 * Atomically adds @i to @v and returns true
153 * if the result is negative, or false when
154 * result is greater than or equal to zero.
155 */
156static inline int atomic_add_negative(int i, atomic_t *v)
157{
158 unsigned char c;
159
160 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
161 : "+m" (v->counter), "=qm" (c)
162 : "ir" (i) : "memory");
163 return c;
164}
165
166/**
167 * atomic_add_return - add integer and return
168 * @i: integer value to add
169 * @v: pointer of type atomic_t
170 *
171 * Atomically adds @i to @v and returns @i + @v
172 */
173static inline int atomic_add_return(int i, atomic_t *v)
174{
175 int __i;
176#ifdef CONFIG_M386
177 unsigned long flags;
178 if (unlikely(boot_cpu_data.x86 <= 3))
179 goto no_xadd;
180#endif
181 /* Modern 486+ processor */
182 __i = i;
183 asm volatile(LOCK_PREFIX "xaddl %0, %1"
184 : "+r" (i), "+m" (v->counter)
185 : : "memory");
186 return i + __i;
187
188#ifdef CONFIG_M386
189no_xadd: /* Legacy 386 processor */
190 raw_local_irq_save(flags);
191 __i = atomic_read(v);
192 atomic_set(v, i + __i);
193 raw_local_irq_restore(flags);
194 return i + __i;
195#endif
196}
197
198/**
199 * atomic_sub_return - subtract integer and return
200 * @v: pointer of type atomic_t
201 * @i: integer value to subtract
202 *
203 * Atomically subtracts @i from @v and returns @v - @i
204 */
205static inline int atomic_sub_return(int i, atomic_t *v)
206{
207 return atomic_add_return(-i, v);
208}
209
210#define atomic_inc_return(v) (atomic_add_return(1, v))
211#define atomic_dec_return(v) (atomic_sub_return(1, v))
212
213static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
214{
215 return cmpxchg(&v->counter, old, new);
216}
217
218static inline int atomic_xchg(atomic_t *v, int new)
219{
220 return xchg(&v->counter, new);
221}
222
223/**
224 * atomic_add_unless - add unless the number is already a given value
225 * @v: pointer of type atomic_t
226 * @a: the amount to add to v...
227 * @u: ...unless v is equal to u.
228 *
229 * Atomically adds @a to @v, so long as @v was not already @u.
230 * Returns non-zero if @v was not @u, and zero otherwise.
231 */
232static inline int atomic_add_unless(atomic_t *v, int a, int u)
233{
234 int c, old;
235 c = atomic_read(v);
236 for (;;) {
237 if (unlikely(c == (u)))
238 break;
239 old = atomic_cmpxchg((v), c, c + (a));
240 if (likely(old == c))
241 break;
242 c = old;
243 }
244 return c != (u);
245}
246
247#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
248
249/**
250 * atomic_inc_short - increment of a short integer
251 * @v: pointer to type int
252 *
253 * Atomically adds 1 to @v
254 * Returns the new value of @u
255 */
256static inline short int atomic_inc_short(short int *v)
257{
258 asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
259 return *v;
260}
261
262#ifdef CONFIG_X86_64
263/**
264 * atomic_or_long - OR of two long integers
265 * @v1: pointer to type unsigned long
266 * @v2: pointer to type unsigned long
267 *
268 * Atomically ORs @v1 and @v2
269 * Returns the result of the OR
270 */
271static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
272{
273 asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
274}
275#endif
276
277/* These are x86-specific, used by some header files */
278#define atomic_clear_mask(mask, addr) \
279 asm volatile(LOCK_PREFIX "andl %0,%1" \
280 : : "r" (~(mask)), "m" (*(addr)) : "memory")
281
282#define atomic_set_mask(mask, addr) \
283 asm volatile(LOCK_PREFIX "orl %0,%1" \
284 : : "r" ((unsigned)(mask)), "m" (*(addr)) \
285 : "memory")
286
287/* Atomic operations are already serializing on x86 */
288#define smp_mb__before_atomic_dec() barrier()
289#define smp_mb__after_atomic_dec() barrier()
290#define smp_mb__before_atomic_inc() barrier()
291#define smp_mb__after_atomic_inc() barrier()
292
1#ifdef CONFIG_X86_32 293#ifdef CONFIG_X86_32
2# include "atomic_32.h" 294# include "atomic64_32.h"
3#else 295#else
4# include "atomic_64.h" 296# include "atomic64_64.h"
5#endif 297#endif
298
299#include <asm-generic/atomic-long.h>
300#endif /* _ASM_X86_ATOMIC_H */
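
A minimal user-space sketch of the compare-and-swap retry loop that atomic_add_unless() above implements with cmpxchg(), expressed with GCC/Clang __atomic builtins; the helper and program below are illustrative only, not part of the patch:

#include <stdio.h>

/* add 'a' to *v unless *v is already 'u'; returns non-zero if the add happened */
static int add_unless(int *v, int a, int u)
{
	int c = __atomic_load_n(v, __ATOMIC_RELAXED);

	for (;;) {
		if (c == u)
			break;
		/* on failure, 'c' is refreshed with the current value of *v */
		if (__atomic_compare_exchange_n(v, &c, c + a, 0,
						__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
			break;
	}
	return c != u;
}

int main(void)
{
	int refcount = 1;

	/* mirrors atomic_inc_not_zero(): take a reference only if it is not already zero */
	if (add_unless(&refcount, 1, 0))
		printf("reference taken, refcount = %d\n", refcount);
	return 0;
}
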
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
new file mode 100644
index 000000000000..03027bf28de5
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -0,0 +1,160 @@
1#ifndef _ASM_X86_ATOMIC64_32_H
2#define _ASM_X86_ATOMIC64_32_H
3
4#include <linux/compiler.h>
5#include <linux/types.h>
6#include <asm/processor.h>
7//#include <asm/cmpxchg.h>
8
9/* An 64bit atomic type */
10
11typedef struct {
12 u64 __aligned(8) counter;
13} atomic64_t;
14
15#define ATOMIC64_INIT(val) { (val) }
16
17extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
18
19/**
20 * atomic64_xchg - xchg atomic64 variable
21 * @ptr: pointer to type atomic64_t
22 * @new_val: value to assign
23 *
24 * Atomically xchgs the value of @ptr to @new_val and returns
25 * the old value.
26 */
27extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
28
29/**
30 * atomic64_set - set atomic64 variable
31 * @ptr: pointer to type atomic64_t
32 * @new_val: value to assign
33 *
34 * Atomically sets the value of @ptr to @new_val.
35 */
36extern void atomic64_set(atomic64_t *ptr, u64 new_val);
37
38/**
39 * atomic64_read - read atomic64 variable
40 * @ptr: pointer to type atomic64_t
41 *
42 * Atomically reads the value of @ptr and returns it.
43 */
44static inline u64 atomic64_read(atomic64_t *ptr)
45{
46 u64 res;
47
48 /*
49 * Note, we inline this atomic64_t primitive because
50 * it only clobbers EAX/EDX and leaves the others
51 * untouched. We also (somewhat subtly) rely on the
52 * fact that cmpxchg8b returns the current 64-bit value
53 * of the memory location we are touching:
54 */
55 asm volatile(
56 "mov %%ebx, %%eax\n\t"
57 "mov %%ecx, %%edx\n\t"
58 LOCK_PREFIX "cmpxchg8b %1\n"
59 : "=&A" (res)
60 : "m" (*ptr)
61 );
62
63 return res;
64}
65
66extern u64 atomic64_read(atomic64_t *ptr);
67
68/**
69 * atomic64_add_return - add and return
70 * @delta: integer value to add
71 * @ptr: pointer to type atomic64_t
72 *
73 * Atomically adds @delta to @ptr and returns @delta + *@ptr
74 */
75extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
76
77/*
78 * Other variants with different arithmetic operators:
79 */
80extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
81extern u64 atomic64_inc_return(atomic64_t *ptr);
82extern u64 atomic64_dec_return(atomic64_t *ptr);
83
84/**
85 * atomic64_add - add integer to atomic64 variable
86 * @delta: integer value to add
87 * @ptr: pointer to type atomic64_t
88 *
89 * Atomically adds @delta to @ptr.
90 */
91extern void atomic64_add(u64 delta, atomic64_t *ptr);
92
93/**
94 * atomic64_sub - subtract the atomic64 variable
95 * @delta: integer value to subtract
96 * @ptr: pointer to type atomic64_t
97 *
98 * Atomically subtracts @delta from @ptr.
99 */
100extern void atomic64_sub(u64 delta, atomic64_t *ptr);
101
102/**
103 * atomic64_sub_and_test - subtract value from variable and test result
104 * @delta: integer value to subtract
105 * @ptr: pointer to type atomic64_t
106 *
107 * Atomically subtracts @delta from @ptr and returns
108 * true if the result is zero, or false for all
109 * other cases.
110 */
111extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
112
113/**
114 * atomic64_inc - increment atomic64 variable
115 * @ptr: pointer to type atomic64_t
116 *
117 * Atomically increments @ptr by 1.
118 */
119extern void atomic64_inc(atomic64_t *ptr);
120
121/**
122 * atomic64_dec - decrement atomic64 variable
123 * @ptr: pointer to type atomic64_t
124 *
125 * Atomically decrements @ptr by 1.
126 */
127extern void atomic64_dec(atomic64_t *ptr);
128
129/**
130 * atomic64_dec_and_test - decrement and test
131 * @ptr: pointer to type atomic64_t
132 *
133 * Atomically decrements @ptr by 1 and
134 * returns true if the result is 0, or false for all other
135 * cases.
136 */
137extern int atomic64_dec_and_test(atomic64_t *ptr);
138
139/**
140 * atomic64_inc_and_test - increment and test
141 * @ptr: pointer to type atomic64_t
142 *
143 * Atomically increments @ptr by 1
144 * and returns true if the result is zero, or false for all
145 * other cases.
146 */
147extern int atomic64_inc_and_test(atomic64_t *ptr);
148
149/**
150 * atomic64_add_negative - add and test if negative
151 * @delta: integer value to add
152 * @ptr: pointer to type atomic64_t
153 *
154 * Atomically adds @delta to @ptr and returns true
155 * if the result is negative, or false when
156 * result is greater than or equal to zero.
157 */
158extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
159
160#endif /* _ASM_X86_ATOMIC64_32_H */
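
The inline atomic64_read() above relies on cmpxchg8b leaving the current 64-bit value in EAX:EDX even when the compare fails; a portable sketch of the same read-via-compare-and-swap idea, assuming user space and GCC/Clang __atomic builtins (hypothetical helper, not part of the patch):

#include <stdint.h>

static uint64_t read64_via_cas(uint64_t *p)
{
	uint64_t expected = 0;

	/*
	 * Whether the exchange succeeds (storing back the value that was
	 * already there) or fails, memory is left unchanged and 'expected'
	 * ends up holding the current 64-bit contents of *p.
	 */
	__atomic_compare_exchange_n(p, &expected, expected, 0,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	return expected;
}
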
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
new file mode 100644
index 000000000000..51c5b4056929
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -0,0 +1,224 @@
1#ifndef _ASM_X86_ATOMIC64_64_H
2#define _ASM_X86_ATOMIC64_64_H
3
4#include <linux/types.h>
5#include <asm/alternative.h>
6#include <asm/cmpxchg.h>
7
8/* The 64-bit atomic type */
9
10#define ATOMIC64_INIT(i) { (i) }
11
12/**
13 * atomic64_read - read atomic64 variable
14 * @v: pointer of type atomic64_t
15 *
16 * Atomically reads the value of @v.
17 * Doesn't imply a read memory barrier.
18 */
19static inline long atomic64_read(const atomic64_t *v)
20{
21 return v->counter;
22}
23
24/**
25 * atomic64_set - set atomic64 variable
26 * @v: pointer to type atomic64_t
27 * @i: required value
28 *
29 * Atomically sets the value of @v to @i.
30 */
31static inline void atomic64_set(atomic64_t *v, long i)
32{
33 v->counter = i;
34}
35
36/**
37 * atomic64_add - add integer to atomic64 variable
38 * @i: integer value to add
39 * @v: pointer to type atomic64_t
40 *
41 * Atomically adds @i to @v.
42 */
43static inline void atomic64_add(long i, atomic64_t *v)
44{
45 asm volatile(LOCK_PREFIX "addq %1,%0"
46 : "=m" (v->counter)
47 : "er" (i), "m" (v->counter));
48}
49
50/**
51 * atomic64_sub - subtract the atomic64 variable
52 * @i: integer value to subtract
53 * @v: pointer to type atomic64_t
54 *
55 * Atomically subtracts @i from @v.
56 */
57static inline void atomic64_sub(long i, atomic64_t *v)
58{
59 asm volatile(LOCK_PREFIX "subq %1,%0"
60 : "=m" (v->counter)
61 : "er" (i), "m" (v->counter));
62}
63
64/**
65 * atomic64_sub_and_test - subtract value from variable and test result
66 * @i: integer value to subtract
67 * @v: pointer to type atomic64_t
68 *
69 * Atomically subtracts @i from @v and returns
70 * true if the result is zero, or false for all
71 * other cases.
72 */
73static inline int atomic64_sub_and_test(long i, atomic64_t *v)
74{
75 unsigned char c;
76
77 asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
78 : "=m" (v->counter), "=qm" (c)
79 : "er" (i), "m" (v->counter) : "memory");
80 return c;
81}
82
83/**
84 * atomic64_inc - increment atomic64 variable
85 * @v: pointer to type atomic64_t
86 *
87 * Atomically increments @v by 1.
88 */
89static inline void atomic64_inc(atomic64_t *v)
90{
91 asm volatile(LOCK_PREFIX "incq %0"
92 : "=m" (v->counter)
93 : "m" (v->counter));
94}
95
96/**
97 * atomic64_dec - decrement atomic64 variable
98 * @v: pointer to type atomic64_t
99 *
100 * Atomically decrements @v by 1.
101 */
102static inline void atomic64_dec(atomic64_t *v)
103{
104 asm volatile(LOCK_PREFIX "decq %0"
105 : "=m" (v->counter)
106 : "m" (v->counter));
107}
108
109/**
110 * atomic64_dec_and_test - decrement and test
111 * @v: pointer to type atomic64_t
112 *
113 * Atomically decrements @v by 1 and
114 * returns true if the result is 0, or false for all other
115 * cases.
116 */
117static inline int atomic64_dec_and_test(atomic64_t *v)
118{
119 unsigned char c;
120
121 asm volatile(LOCK_PREFIX "decq %0; sete %1"
122 : "=m" (v->counter), "=qm" (c)
123 : "m" (v->counter) : "memory");
124 return c != 0;
125}
126
127/**
128 * atomic64_inc_and_test - increment and test
129 * @v: pointer to type atomic64_t
130 *
131 * Atomically increments @v by 1
132 * and returns true if the result is zero, or false for all
133 * other cases.
134 */
135static inline int atomic64_inc_and_test(atomic64_t *v)
136{
137 unsigned char c;
138
139 asm volatile(LOCK_PREFIX "incq %0; sete %1"
140 : "=m" (v->counter), "=qm" (c)
141 : "m" (v->counter) : "memory");
142 return c != 0;
143}
144
145/**
146 * atomic64_add_negative - add and test if negative
147 * @i: integer value to add
148 * @v: pointer to type atomic64_t
149 *
150 * Atomically adds @i to @v and returns true
151 * if the result is negative, or false when
152 * result is greater than or equal to zero.
153 */
154static inline int atomic64_add_negative(long i, atomic64_t *v)
155{
156 unsigned char c;
157
158 asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
159 : "=m" (v->counter), "=qm" (c)
160 : "er" (i), "m" (v->counter) : "memory");
161 return c;
162}
163
164/**
165 * atomic64_add_return - add and return
166 * @i: integer value to add
167 * @v: pointer to type atomic64_t
168 *
169 * Atomically adds @i to @v and returns @i + @v
170 */
171static inline long atomic64_add_return(long i, atomic64_t *v)
172{
173 long __i = i;
174 asm volatile(LOCK_PREFIX "xaddq %0, %1;"
175 : "+r" (i), "+m" (v->counter)
176 : : "memory");
177 return i + __i;
178}
179
180static inline long atomic64_sub_return(long i, atomic64_t *v)
181{
182 return atomic64_add_return(-i, v);
183}
184
185#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
186#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
187
188static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
189{
190 return cmpxchg(&v->counter, old, new);
191}
192
193static inline long atomic64_xchg(atomic64_t *v, long new)
194{
195 return xchg(&v->counter, new);
196}
197
198/**
199 * atomic64_add_unless - add unless the number is a given value
200 * @v: pointer of type atomic64_t
201 * @a: the amount to add to v...
202 * @u: ...unless v is equal to u.
203 *
204 * Atomically adds @a to @v, so long as it was not @u.
205 * Returns non-zero if @v was not @u, and zero otherwise.
206 */
207static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
208{
209 long c, old;
210 c = atomic64_read(v);
211 for (;;) {
212 if (unlikely(c == (u)))
213 break;
214 old = atomic64_cmpxchg((v), c, c + (a));
215 if (likely(old == c))
216 break;
217 c = old;
218 }
219 return c != (u);
220}
221
222#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
223
224#endif /* _ASM_X86_ATOMIC64_64_H */
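
atomic64_add_return() above uses XADD, which leaves the previous value in the source register, so the new value is simply the old value plus the delta; a user-space analogue of that fetch-and-add pattern, assuming GCC/Clang __atomic builtins (illustrative only, not part of the patch):

#include <stdint.h>

static int64_t add_return64(int64_t *v, int64_t delta)
{
	/* __atomic_fetch_add returns the value *before* the addition */
	return __atomic_fetch_add(v, delta, __ATOMIC_SEQ_CST) + delta;
}
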
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
deleted file mode 100644
index dc5a667ff791..000000000000
--- a/arch/x86/include/asm/atomic_32.h
+++ /dev/null
@@ -1,415 +0,0 @@
1#ifndef _ASM_X86_ATOMIC_32_H
2#define _ASM_X86_ATOMIC_32_H
3
4#include <linux/compiler.h>
5#include <linux/types.h>
6#include <asm/processor.h>
7#include <asm/cmpxchg.h>
8
9/*
10 * Atomic operations that C can't guarantee us. Useful for
11 * resource counting etc..
12 */
13
14#define ATOMIC_INIT(i) { (i) }
15
16/**
17 * atomic_read - read atomic variable
18 * @v: pointer of type atomic_t
19 *
20 * Atomically reads the value of @v.
21 */
22static inline int atomic_read(const atomic_t *v)
23{
24 return v->counter;
25}
26
27/**
28 * atomic_set - set atomic variable
29 * @v: pointer of type atomic_t
30 * @i: required value
31 *
32 * Atomically sets the value of @v to @i.
33 */
34static inline void atomic_set(atomic_t *v, int i)
35{
36 v->counter = i;
37}
38
39/**
40 * atomic_add - add integer to atomic variable
41 * @i: integer value to add
42 * @v: pointer of type atomic_t
43 *
44 * Atomically adds @i to @v.
45 */
46static inline void atomic_add(int i, atomic_t *v)
47{
48 asm volatile(LOCK_PREFIX "addl %1,%0"
49 : "+m" (v->counter)
50 : "ir" (i));
51}
52
53/**
54 * atomic_sub - subtract integer from atomic variable
55 * @i: integer value to subtract
56 * @v: pointer of type atomic_t
57 *
58 * Atomically subtracts @i from @v.
59 */
60static inline void atomic_sub(int i, atomic_t *v)
61{
62 asm volatile(LOCK_PREFIX "subl %1,%0"
63 : "+m" (v->counter)
64 : "ir" (i));
65}
66
67/**
68 * atomic_sub_and_test - subtract value from variable and test result
69 * @i: integer value to subtract
70 * @v: pointer of type atomic_t
71 *
72 * Atomically subtracts @i from @v and returns
73 * true if the result is zero, or false for all
74 * other cases.
75 */
76static inline int atomic_sub_and_test(int i, atomic_t *v)
77{
78 unsigned char c;
79
80 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
81 : "+m" (v->counter), "=qm" (c)
82 : "ir" (i) : "memory");
83 return c;
84}
85
86/**
87 * atomic_inc - increment atomic variable
88 * @v: pointer of type atomic_t
89 *
90 * Atomically increments @v by 1.
91 */
92static inline void atomic_inc(atomic_t *v)
93{
94 asm volatile(LOCK_PREFIX "incl %0"
95 : "+m" (v->counter));
96}
97
98/**
99 * atomic_dec - decrement atomic variable
100 * @v: pointer of type atomic_t
101 *
102 * Atomically decrements @v by 1.
103 */
104static inline void atomic_dec(atomic_t *v)
105{
106 asm volatile(LOCK_PREFIX "decl %0"
107 : "+m" (v->counter));
108}
109
110/**
111 * atomic_dec_and_test - decrement and test
112 * @v: pointer of type atomic_t
113 *
114 * Atomically decrements @v by 1 and
115 * returns true if the result is 0, or false for all other
116 * cases.
117 */
118static inline int atomic_dec_and_test(atomic_t *v)
119{
120 unsigned char c;
121
122 asm volatile(LOCK_PREFIX "decl %0; sete %1"
123 : "+m" (v->counter), "=qm" (c)
124 : : "memory");
125 return c != 0;
126}
127
128/**
129 * atomic_inc_and_test - increment and test
130 * @v: pointer of type atomic_t
131 *
132 * Atomically increments @v by 1
133 * and returns true if the result is zero, or false for all
134 * other cases.
135 */
136static inline int atomic_inc_and_test(atomic_t *v)
137{
138 unsigned char c;
139
140 asm volatile(LOCK_PREFIX "incl %0; sete %1"
141 : "+m" (v->counter), "=qm" (c)
142 : : "memory");
143 return c != 0;
144}
145
146/**
147 * atomic_add_negative - add and test if negative
148 * @v: pointer of type atomic_t
149 * @i: integer value to add
150 *
151 * Atomically adds @i to @v and returns true
152 * if the result is negative, or false when
153 * result is greater than or equal to zero.
154 */
155static inline int atomic_add_negative(int i, atomic_t *v)
156{
157 unsigned char c;
158
159 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
160 : "+m" (v->counter), "=qm" (c)
161 : "ir" (i) : "memory");
162 return c;
163}
164
165/**
166 * atomic_add_return - add integer and return
167 * @v: pointer of type atomic_t
168 * @i: integer value to add
169 *
170 * Atomically adds @i to @v and returns @i + @v
171 */
172static inline int atomic_add_return(int i, atomic_t *v)
173{
174 int __i;
175#ifdef CONFIG_M386
176 unsigned long flags;
177 if (unlikely(boot_cpu_data.x86 <= 3))
178 goto no_xadd;
179#endif
180 /* Modern 486+ processor */
181 __i = i;
182 asm volatile(LOCK_PREFIX "xaddl %0, %1"
183 : "+r" (i), "+m" (v->counter)
184 : : "memory");
185 return i + __i;
186
187#ifdef CONFIG_M386
188no_xadd: /* Legacy 386 processor */
189 local_irq_save(flags);
190 __i = atomic_read(v);
191 atomic_set(v, i + __i);
192 local_irq_restore(flags);
193 return i + __i;
194#endif
195}
196
197/**
198 * atomic_sub_return - subtract integer and return
199 * @v: pointer of type atomic_t
200 * @i: integer value to subtract
201 *
202 * Atomically subtracts @i from @v and returns @v - @i
203 */
204static inline int atomic_sub_return(int i, atomic_t *v)
205{
206 return atomic_add_return(-i, v);
207}
208
209static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
210{
211 return cmpxchg(&v->counter, old, new);
212}
213
214static inline int atomic_xchg(atomic_t *v, int new)
215{
216 return xchg(&v->counter, new);
217}
218
219/**
220 * atomic_add_unless - add unless the number is already a given value
221 * @v: pointer of type atomic_t
222 * @a: the amount to add to v...
223 * @u: ...unless v is equal to u.
224 *
225 * Atomically adds @a to @v, so long as @v was not already @u.
226 * Returns non-zero if @v was not @u, and zero otherwise.
227 */
228static inline int atomic_add_unless(atomic_t *v, int a, int u)
229{
230 int c, old;
231 c = atomic_read(v);
232 for (;;) {
233 if (unlikely(c == (u)))
234 break;
235 old = atomic_cmpxchg((v), c, c + (a));
236 if (likely(old == c))
237 break;
238 c = old;
239 }
240 return c != (u);
241}
242
243#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
244
245#define atomic_inc_return(v) (atomic_add_return(1, v))
246#define atomic_dec_return(v) (atomic_sub_return(1, v))
247
248/* These are x86-specific, used by some header files */
249#define atomic_clear_mask(mask, addr) \
250 asm volatile(LOCK_PREFIX "andl %0,%1" \
251 : : "r" (~(mask)), "m" (*(addr)) : "memory")
252
253#define atomic_set_mask(mask, addr) \
254 asm volatile(LOCK_PREFIX "orl %0,%1" \
255 : : "r" (mask), "m" (*(addr)) : "memory")
256
257/* Atomic operations are already serializing on x86 */
258#define smp_mb__before_atomic_dec() barrier()
259#define smp_mb__after_atomic_dec() barrier()
260#define smp_mb__before_atomic_inc() barrier()
261#define smp_mb__after_atomic_inc() barrier()
262
263/* An 64bit atomic type */
264
265typedef struct {
266 u64 __aligned(8) counter;
267} atomic64_t;
268
269#define ATOMIC64_INIT(val) { (val) }
270
271extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
272
273/**
274 * atomic64_xchg - xchg atomic64 variable
275 * @ptr: pointer to type atomic64_t
276 * @new_val: value to assign
277 *
278 * Atomically xchgs the value of @ptr to @new_val and returns
279 * the old value.
280 */
281extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
282
283/**
284 * atomic64_set - set atomic64 variable
285 * @ptr: pointer to type atomic64_t
286 * @new_val: value to assign
287 *
288 * Atomically sets the value of @ptr to @new_val.
289 */
290extern void atomic64_set(atomic64_t *ptr, u64 new_val);
291
292/**
293 * atomic64_read - read atomic64 variable
294 * @ptr: pointer to type atomic64_t
295 *
296 * Atomically reads the value of @ptr and returns it.
297 */
298static inline u64 atomic64_read(atomic64_t *ptr)
299{
300 u64 res;
301
302 /*
303 * Note, we inline this atomic64_t primitive because
304 * it only clobbers EAX/EDX and leaves the others
305 * untouched. We also (somewhat subtly) rely on the
306 * fact that cmpxchg8b returns the current 64-bit value
307 * of the memory location we are touching:
308 */
309 asm volatile(
310 "mov %%ebx, %%eax\n\t"
311 "mov %%ecx, %%edx\n\t"
312 LOCK_PREFIX "cmpxchg8b %1\n"
313 : "=&A" (res)
314 : "m" (*ptr)
315 );
316
317 return res;
318}
319
320extern u64 atomic64_read(atomic64_t *ptr);
321
322/**
323 * atomic64_add_return - add and return
324 * @delta: integer value to add
325 * @ptr: pointer to type atomic64_t
326 *
327 * Atomically adds @delta to @ptr and returns @delta + *@ptr
328 */
329extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
330
331/*
332 * Other variants with different arithmetic operators:
333 */
334extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
335extern u64 atomic64_inc_return(atomic64_t *ptr);
336extern u64 atomic64_dec_return(atomic64_t *ptr);
337
338/**
339 * atomic64_add - add integer to atomic64 variable
340 * @delta: integer value to add
341 * @ptr: pointer to type atomic64_t
342 *
343 * Atomically adds @delta to @ptr.
344 */
345extern void atomic64_add(u64 delta, atomic64_t *ptr);
346
347/**
348 * atomic64_sub - subtract the atomic64 variable
349 * @delta: integer value to subtract
350 * @ptr: pointer to type atomic64_t
351 *
352 * Atomically subtracts @delta from @ptr.
353 */
354extern void atomic64_sub(u64 delta, atomic64_t *ptr);
355
356/**
357 * atomic64_sub_and_test - subtract value from variable and test result
358 * @delta: integer value to subtract
359 * @ptr: pointer to type atomic64_t
360 *
361 * Atomically subtracts @delta from @ptr and returns
362 * true if the result is zero, or false for all
363 * other cases.
364 */
365extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
366
367/**
368 * atomic64_inc - increment atomic64 variable
369 * @ptr: pointer to type atomic64_t
370 *
371 * Atomically increments @ptr by 1.
372 */
373extern void atomic64_inc(atomic64_t *ptr);
374
375/**
376 * atomic64_dec - decrement atomic64 variable
377 * @ptr: pointer to type atomic64_t
378 *
379 * Atomically decrements @ptr by 1.
380 */
381extern void atomic64_dec(atomic64_t *ptr);
382
383/**
384 * atomic64_dec_and_test - decrement and test
385 * @ptr: pointer to type atomic64_t
386 *
387 * Atomically decrements @ptr by 1 and
388 * returns true if the result is 0, or false for all other
389 * cases.
390 */
391extern int atomic64_dec_and_test(atomic64_t *ptr);
392
393/**
394 * atomic64_inc_and_test - increment and test
395 * @ptr: pointer to type atomic64_t
396 *
397 * Atomically increments @ptr by 1
398 * and returns true if the result is zero, or false for all
399 * other cases.
400 */
401extern int atomic64_inc_and_test(atomic64_t *ptr);
402
403/**
404 * atomic64_add_negative - add and test if negative
405 * @delta: integer value to add
406 * @ptr: pointer to type atomic64_t
407 *
408 * Atomically adds @delta to @ptr and returns true
409 * if the result is negative, or false when
410 * result is greater than or equal to zero.
411 */
412extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
413
414#include <asm-generic/atomic-long.h>
415#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
deleted file mode 100644
index d605dc268e79..000000000000
--- a/arch/x86/include/asm/atomic_64.h
+++ /dev/null
@@ -1,485 +0,0 @@
1#ifndef _ASM_X86_ATOMIC_64_H
2#define _ASM_X86_ATOMIC_64_H
3
4#include <linux/types.h>
5#include <asm/alternative.h>
6#include <asm/cmpxchg.h>
7
8/*
9 * Atomic operations that C can't guarantee us. Useful for
10 * resource counting etc..
11 */
12
13#define ATOMIC_INIT(i) { (i) }
14
15/**
16 * atomic_read - read atomic variable
17 * @v: pointer of type atomic_t
18 *
19 * Atomically reads the value of @v.
20 */
21static inline int atomic_read(const atomic_t *v)
22{
23 return v->counter;
24}
25
26/**
27 * atomic_set - set atomic variable
28 * @v: pointer of type atomic_t
29 * @i: required value
30 *
31 * Atomically sets the value of @v to @i.
32 */
33static inline void atomic_set(atomic_t *v, int i)
34{
35 v->counter = i;
36}
37
38/**
39 * atomic_add - add integer to atomic variable
40 * @i: integer value to add
41 * @v: pointer of type atomic_t
42 *
43 * Atomically adds @i to @v.
44 */
45static inline void atomic_add(int i, atomic_t *v)
46{
47 asm volatile(LOCK_PREFIX "addl %1,%0"
48 : "=m" (v->counter)
49 : "ir" (i), "m" (v->counter));
50}
51
52/**
53 * atomic_sub - subtract the atomic variable
54 * @i: integer value to subtract
55 * @v: pointer of type atomic_t
56 *
57 * Atomically subtracts @i from @v.
58 */
59static inline void atomic_sub(int i, atomic_t *v)
60{
61 asm volatile(LOCK_PREFIX "subl %1,%0"
62 : "=m" (v->counter)
63 : "ir" (i), "m" (v->counter));
64}
65
66/**
67 * atomic_sub_and_test - subtract value from variable and test result
68 * @i: integer value to subtract
69 * @v: pointer of type atomic_t
70 *
71 * Atomically subtracts @i from @v and returns
72 * true if the result is zero, or false for all
73 * other cases.
74 */
75static inline int atomic_sub_and_test(int i, atomic_t *v)
76{
77 unsigned char c;
78
79 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
80 : "=m" (v->counter), "=qm" (c)
81 : "ir" (i), "m" (v->counter) : "memory");
82 return c;
83}
84
85/**
86 * atomic_inc - increment atomic variable
87 * @v: pointer of type atomic_t
88 *
89 * Atomically increments @v by 1.
90 */
91static inline void atomic_inc(atomic_t *v)
92{
93 asm volatile(LOCK_PREFIX "incl %0"
94 : "=m" (v->counter)
95 : "m" (v->counter));
96}
97
98/**
99 * atomic_dec - decrement atomic variable
100 * @v: pointer of type atomic_t
101 *
102 * Atomically decrements @v by 1.
103 */
104static inline void atomic_dec(atomic_t *v)
105{
106 asm volatile(LOCK_PREFIX "decl %0"
107 : "=m" (v->counter)
108 : "m" (v->counter));
109}
110
111/**
112 * atomic_dec_and_test - decrement and test
113 * @v: pointer of type atomic_t
114 *
115 * Atomically decrements @v by 1 and
116 * returns true if the result is 0, or false for all other
117 * cases.
118 */
119static inline int atomic_dec_and_test(atomic_t *v)
120{
121 unsigned char c;
122
123 asm volatile(LOCK_PREFIX "decl %0; sete %1"
124 : "=m" (v->counter), "=qm" (c)
125 : "m" (v->counter) : "memory");
126 return c != 0;
127}
128
129/**
130 * atomic_inc_and_test - increment and test
131 * @v: pointer of type atomic_t
132 *
133 * Atomically increments @v by 1
134 * and returns true if the result is zero, or false for all
135 * other cases.
136 */
137static inline int atomic_inc_and_test(atomic_t *v)
138{
139 unsigned char c;
140
141 asm volatile(LOCK_PREFIX "incl %0; sete %1"
142 : "=m" (v->counter), "=qm" (c)
143 : "m" (v->counter) : "memory");
144 return c != 0;
145}
146
147/**
148 * atomic_add_negative - add and test if negative
149 * @i: integer value to add
150 * @v: pointer of type atomic_t
151 *
152 * Atomically adds @i to @v and returns true
153 * if the result is negative, or false when
154 * result is greater than or equal to zero.
155 */
156static inline int atomic_add_negative(int i, atomic_t *v)
157{
158 unsigned char c;
159
160 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
161 : "=m" (v->counter), "=qm" (c)
162 : "ir" (i), "m" (v->counter) : "memory");
163 return c;
164}
165
166/**
167 * atomic_add_return - add and return
168 * @i: integer value to add
169 * @v: pointer of type atomic_t
170 *
171 * Atomically adds @i to @v and returns @i + @v
172 */
173static inline int atomic_add_return(int i, atomic_t *v)
174{
175 int __i = i;
176 asm volatile(LOCK_PREFIX "xaddl %0, %1"
177 : "+r" (i), "+m" (v->counter)
178 : : "memory");
179 return i + __i;
180}
181
182static inline int atomic_sub_return(int i, atomic_t *v)
183{
184 return atomic_add_return(-i, v);
185}
186
187#define atomic_inc_return(v) (atomic_add_return(1, v))
188#define atomic_dec_return(v) (atomic_sub_return(1, v))
189
190/* The 64-bit atomic type */
191
192#define ATOMIC64_INIT(i) { (i) }
193
194/**
195 * atomic64_read - read atomic64 variable
196 * @v: pointer of type atomic64_t
197 *
198 * Atomically reads the value of @v.
199 * Doesn't imply a read memory barrier.
200 */
201static inline long atomic64_read(const atomic64_t *v)
202{
203 return v->counter;
204}
205
206/**
207 * atomic64_set - set atomic64 variable
208 * @v: pointer to type atomic64_t
209 * @i: required value
210 *
211 * Atomically sets the value of @v to @i.
212 */
213static inline void atomic64_set(atomic64_t *v, long i)
214{
215 v->counter = i;
216}
217
218/**
219 * atomic64_add - add integer to atomic64 variable
220 * @i: integer value to add
221 * @v: pointer to type atomic64_t
222 *
223 * Atomically adds @i to @v.
224 */
225static inline void atomic64_add(long i, atomic64_t *v)
226{
227 asm volatile(LOCK_PREFIX "addq %1,%0"
228 : "=m" (v->counter)
229 : "er" (i), "m" (v->counter));
230}
231
232/**
233 * atomic64_sub - subtract the atomic64 variable
234 * @i: integer value to subtract
235 * @v: pointer to type atomic64_t
236 *
237 * Atomically subtracts @i from @v.
238 */
239static inline void atomic64_sub(long i, atomic64_t *v)
240{
241 asm volatile(LOCK_PREFIX "subq %1,%0"
242 : "=m" (v->counter)
243 : "er" (i), "m" (v->counter));
244}
245
246/**
247 * atomic64_sub_and_test - subtract value from variable and test result
248 * @i: integer value to subtract
249 * @v: pointer to type atomic64_t
250 *
251 * Atomically subtracts @i from @v and returns
252 * true if the result is zero, or false for all
253 * other cases.
254 */
255static inline int atomic64_sub_and_test(long i, atomic64_t *v)
256{
257 unsigned char c;
258
259 asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
260 : "=m" (v->counter), "=qm" (c)
261 : "er" (i), "m" (v->counter) : "memory");
262 return c;
263}
264
265/**
266 * atomic64_inc - increment atomic64 variable
267 * @v: pointer to type atomic64_t
268 *
269 * Atomically increments @v by 1.
270 */
271static inline void atomic64_inc(atomic64_t *v)
272{
273 asm volatile(LOCK_PREFIX "incq %0"
274 : "=m" (v->counter)
275 : "m" (v->counter));
276}
277
278/**
279 * atomic64_dec - decrement atomic64 variable
280 * @v: pointer to type atomic64_t
281 *
282 * Atomically decrements @v by 1.
283 */
284static inline void atomic64_dec(atomic64_t *v)
285{
286 asm volatile(LOCK_PREFIX "decq %0"
287 : "=m" (v->counter)
288 : "m" (v->counter));
289}
290
291/**
292 * atomic64_dec_and_test - decrement and test
293 * @v: pointer to type atomic64_t
294 *
295 * Atomically decrements @v by 1 and
296 * returns true if the result is 0, or false for all other
297 * cases.
298 */
299static inline int atomic64_dec_and_test(atomic64_t *v)
300{
301 unsigned char c;
302
303 asm volatile(LOCK_PREFIX "decq %0; sete %1"
304 : "=m" (v->counter), "=qm" (c)
305 : "m" (v->counter) : "memory");
306 return c != 0;
307}
308
309/**
310 * atomic64_inc_and_test - increment and test
311 * @v: pointer to type atomic64_t
312 *
313 * Atomically increments @v by 1
314 * and returns true if the result is zero, or false for all
315 * other cases.
316 */
317static inline int atomic64_inc_and_test(atomic64_t *v)
318{
319 unsigned char c;
320
321 asm volatile(LOCK_PREFIX "incq %0; sete %1"
322 : "=m" (v->counter), "=qm" (c)
323 : "m" (v->counter) : "memory");
324 return c != 0;
325}
326
327/**
328 * atomic64_add_negative - add and test if negative
329 * @i: integer value to add
330 * @v: pointer to type atomic64_t
331 *
332 * Atomically adds @i to @v and returns true
333 * if the result is negative, or false when
334 * result is greater than or equal to zero.
335 */
336static inline int atomic64_add_negative(long i, atomic64_t *v)
337{
338 unsigned char c;
339
340 asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
341 : "=m" (v->counter), "=qm" (c)
342 : "er" (i), "m" (v->counter) : "memory");
343 return c;
344}
345
346/**
347 * atomic64_add_return - add and return
348 * @i: integer value to add
349 * @v: pointer to type atomic64_t
350 *
351 * Atomically adds @i to @v and returns @i + @v
352 */
353static inline long atomic64_add_return(long i, atomic64_t *v)
354{
355 long __i = i;
356 asm volatile(LOCK_PREFIX "xaddq %0, %1;"
357 : "+r" (i), "+m" (v->counter)
358 : : "memory");
359 return i + __i;
360}
361
362static inline long atomic64_sub_return(long i, atomic64_t *v)
363{
364 return atomic64_add_return(-i, v);
365}
366
367#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
368#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
369
370static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
371{
372 return cmpxchg(&v->counter, old, new);
373}
374
375static inline long atomic64_xchg(atomic64_t *v, long new)
376{
377 return xchg(&v->counter, new);
378}
379
380static inline long atomic_cmpxchg(atomic_t *v, int old, int new)
381{
382 return cmpxchg(&v->counter, old, new);
383}
384
385static inline long atomic_xchg(atomic_t *v, int new)
386{
387 return xchg(&v->counter, new);
388}
389
390/**
391 * atomic_add_unless - add unless the number is a given value
392 * @v: pointer of type atomic_t
393 * @a: the amount to add to v...
394 * @u: ...unless v is equal to u.
395 *
396 * Atomically adds @a to @v, so long as it was not @u.
397 * Returns non-zero if @v was not @u, and zero otherwise.
398 */
399static inline int atomic_add_unless(atomic_t *v, int a, int u)
400{
401 int c, old;
402 c = atomic_read(v);
403 for (;;) {
404 if (unlikely(c == (u)))
405 break;
406 old = atomic_cmpxchg((v), c, c + (a));
407 if (likely(old == c))
408 break;
409 c = old;
410 }
411 return c != (u);
412}
413
414#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
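A hedged usage sketch of the cmpxchg loop above: taking a reference only while the object is still live. struct my_obj and my_obj_tryget() are illustrative names, not part of this patch.

#include <asm/atomic.h>

struct my_obj {
        atomic_t refcount;      /* 0 means the object is being torn down */
};

/* Returns non-zero if a reference was taken, zero if the object is dying. */
static int my_obj_tryget(struct my_obj *obj)
{
        /*
         * atomic_inc_not_zero() retries the cmpxchg until it either sees 0
         * (object dead, give up) or successfully bumps the count.
         */
        return atomic_inc_not_zero(&obj->refcount);
}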
415
416/**
417 * atomic64_add_unless - add unless the number is a given value
418 * @v: pointer of type atomic64_t
419 * @a: the amount to add to v...
420 * @u: ...unless v is equal to u.
421 *
422 * Atomically adds @a to @v, so long as it was not @u.
423 * Returns non-zero if @v was not @u, and zero otherwise.
424 */
425static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
426{
427 long c, old;
428 c = atomic64_read(v);
429 for (;;) {
430 if (unlikely(c == (u)))
431 break;
432 old = atomic64_cmpxchg((v), c, c + (a));
433 if (likely(old == c))
434 break;
435 c = old;
436 }
437 return c != (u);
438}
439
440/**
441 * atomic_inc_short - increment of a short integer
442 * @v: pointer to type short int
443 *
444 * Atomically adds 1 to @v
445 * Returns the new value of @v
446 */
447static inline short int atomic_inc_short(short int *v)
448{
449 asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
450 return *v;
451}
452
453/**
454 * atomic_or_long - OR of two long integers
455 * @v1: pointer to type unsigned long
456 * @v2: value of type unsigned long
457 *
458 * Atomically ORs @v1 and @v2
459 * The result is stored in *@v1; nothing is returned
460 */
461static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
462{
463 asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
464}
465
466#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
467
468/* These are x86-specific, used by some header files */
469#define atomic_clear_mask(mask, addr) \
470 asm volatile(LOCK_PREFIX "andl %0,%1" \
471 : : "r" (~(mask)), "m" (*(addr)) : "memory")
472
473#define atomic_set_mask(mask, addr) \
474 asm volatile(LOCK_PREFIX "orl %0,%1" \
475 : : "r" ((unsigned)(mask)), "m" (*(addr)) \
476 : "memory")
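An illustrative sketch of the two mask helpers above; the flag bits and the status word are invented for the example.

#include <asm/atomic.h>

#define MY_FLAG_BUSY    0x01            /* hypothetical status bit */
#define MY_FLAG_ERROR   0x02            /* hypothetical status bit */

static unsigned int my_status;

static void mark_busy(void)
{
        /* lock orl: sets the bit without losing concurrent updates */
        atomic_set_mask(MY_FLAG_BUSY, &my_status);
}

static void clear_error(void)
{
        /* lock andl with the complement clears only the requested bit */
        atomic_clear_mask(MY_FLAG_ERROR, &my_status);
}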
477
478/* Atomic operations are already serializing on x86 */
479#define smp_mb__before_atomic_dec() barrier()
480#define smp_mb__after_atomic_dec() barrier()
481#define smp_mb__before_atomic_inc() barrier()
482#define smp_mb__after_atomic_inc() barrier()
483
484#include <asm-generic/atomic-long.h>
485#endif /* _ASM_X86_ATOMIC_64_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 637e1ec963c3..0cd82d068613 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -168,6 +168,10 @@
168#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ 168#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
169#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ 169#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
170#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ 170#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
171#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */
172#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */
173#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */
174#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */
171 175
172#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 176#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
173 177
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 8240f76b531e..b81002f23614 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -14,6 +14,9 @@
14 which debugging register was responsible for the trap. The other bits 14 which debugging register was responsible for the trap. The other bits
15 are either reserved or not of interest to us. */ 15 are either reserved or not of interest to us. */
16 16
17/* Define reserved bits in DR6 which are always set to 1 */
18#define DR6_RESERVED (0xFFFF0FF0)
19
17#define DR_TRAP0 (0x1) /* db0 */ 20#define DR_TRAP0 (0x1) /* db0 */
18#define DR_TRAP1 (0x2) /* db1 */ 21#define DR_TRAP1 (0x2) /* db1 */
19#define DR_TRAP2 (0x4) /* db2 */ 22#define DR_TRAP2 (0x4) /* db2 */
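A short sketch (an assumption about typical use, not taken from this patch) of how the new DR6_RESERVED constant is applied: stripping the always-set reserved bits before inspecting the trap status.

/* dr6_trap_bits() is a hypothetical helper for illustration only. */
static inline unsigned long dr6_trap_bits(unsigned long dr6)
{
        /* Reserved bits read as 1; mask them off to leave DR_TRAP0..3 etc. */
        return dr6 & ~DR6_RESERVED;
}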
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 761249e396fe..0e22296790d3 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -111,11 +111,8 @@ extern unsigned long end_user_pfn;
111 111
112extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); 112extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
113extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); 113extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
114extern void reserve_early(u64 start, u64 end, char *name);
115extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
116extern void free_early(u64 start, u64 end);
117extern void early_res_to_bootmem(u64 start, u64 end);
118extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); 114extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
115#include <linux/early_res.h>
119 116
120extern unsigned long e820_end_of_ram_pfn(void); 117extern unsigned long e820_end_of_ram_pfn(void);
121extern unsigned long e820_end_of_low_ram_pfn(void); 118extern unsigned long e820_end_of_low_ram_pfn(void);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1994d3f58443..f2ad2163109d 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -170,10 +170,7 @@ static inline void elf_common_init(struct thread_struct *t,
170} 170}
171 171
172#define ELF_PLAT_INIT(_r, load_addr) \ 172#define ELF_PLAT_INIT(_r, load_addr) \
173do { \ 173 elf_common_init(&current->thread, _r, 0)
174 elf_common_init(&current->thread, _r, 0); \
175 clear_thread_flag(TIF_IA32); \
176} while (0)
177 174
178#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ 175#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
179 elf_common_init(&current->thread, regs, __USER_DS) 176 elf_common_init(&current->thread, regs, __USER_DS)
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
index 53018464aea6..2519d0679d99 100644
--- a/arch/x86/include/asm/fb.h
+++ b/arch/x86/include/asm/fb.h
@@ -12,10 +12,6 @@ static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
12 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; 12 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
13} 13}
14 14
15#ifdef CONFIG_X86_32
16extern int fb_is_primary_device(struct fb_info *info); 15extern int fb_is_primary_device(struct fb_info *info);
17#else
18static inline int fb_is_primary_device(struct fb_info *info) { return 0; }
19#endif
20 16
21#endif /* _ASM_X86_FB_H */ 17#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 14f9890eb495..635f03bb4995 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -118,14 +118,20 @@ enum fixed_addresses {
118 * 256 temporary boot-time mappings, used by early_ioremap(), 118 * 256 temporary boot-time mappings, used by early_ioremap(),
119 * before ioremap() is functional. 119 * before ioremap() is functional.
120 * 120 *
121 * We round it up to the next 256 pages boundary so that we 121 * If necessary we round it up to the next 256 pages boundary so
122 * can have a single pgd entry and a single pte table: 122 * that we can have a single pgd entry and a single pte table:
123 */ 123 */
124#define NR_FIX_BTMAPS 64 124#define NR_FIX_BTMAPS 64
125#define FIX_BTMAPS_SLOTS 4 125#define FIX_BTMAPS_SLOTS 4
126 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - 126#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
127 (__end_of_permanent_fixed_addresses & 255), 127 FIX_BTMAP_END =
128 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, 128 (__end_of_permanent_fixed_addresses ^
129 (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
130 -PTRS_PER_PTE
131 ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
132 (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
133 : __end_of_permanent_fixed_addresses,
134 FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
129#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT 135#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
130 FIX_OHCI1394_BASE, 136 FIX_OHCI1394_BASE,
131#endif 137#endif
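To make the new FIX_BTMAP_END expression easier to follow, here is the same rounding rule written as ordinary C (an editor's sketch; the real computation happens at compile time inside the enum, and fix_btmap_end() is a made-up name).

/*
 * If the TOTAL_FIX_BTMAPS-slot window starting at 'end' would straddle a
 * pte-table boundary (PTRS_PER_PTE entries), round the start up to the next
 * TOTAL_FIX_BTMAPS boundary so a single pgd entry / pte table covers all
 * boot-time fixmap slots; otherwise keep 'end' as it is.
 */
static unsigned long fix_btmap_end(unsigned long end)
{
        unsigned long total = NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS;  /* 256 */

        if ((end ^ (end + total - 1)) & -PTRS_PER_PTE)
                return end + total - (end & (total - 1));
        return end;
}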
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 014c2b85ae45..a726650fc80f 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
67struct page *kmap_atomic_to_page(void *ptr); 67struct page *kmap_atomic_to_page(void *ptr);
68 68
69#ifndef CONFIG_PARAVIRT
70#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
71#endif
72
73#define flush_cache_kmaps() do { } while (0) 69#define flush_cache_kmaps() do { } while (0)
74 70
75extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, 71extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
new file mode 100644
index 000000000000..e153a2b3889a
--- /dev/null
+++ b/arch/x86/include/asm/hyperv.h
@@ -0,0 +1,186 @@
1#ifndef _ASM_X86_KVM_HYPERV_H
2#define _ASM_X86_KVM_HYPERV_H
3
4#include <linux/types.h>
5
6/*
7 * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
8 * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
9 */
10#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
11#define HYPERV_CPUID_INTERFACE 0x40000001
12#define HYPERV_CPUID_VERSION 0x40000002
13#define HYPERV_CPUID_FEATURES 0x40000003
14#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
15#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
16
17/*
18 * Feature identification. EAX indicates which features are available
19 * to the partition based upon the current partition privileges.
20 */
21
22/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
23#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
24/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
25#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
26/*
27 * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
28 * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
29 */
30#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
31/*
32 * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
33 * HV_X64_MSR_STIMER3_COUNT) available
34 */
35#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
36/*
37 * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
38 * are available
39 */
40#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
41/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
42#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
43/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
44#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
45/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
46#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
47 /*
48 * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
49 * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
50 * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
51 */
52#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
53
54/*
55 * Feature identification: EBX indicates which flags were specified at
56 * partition creation. The format is the same as the partition creation
57 * flag structure defined in section Partition Creation Flags.
58 */
59#define HV_X64_CREATE_PARTITIONS (1 << 0)
60#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
61#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
62#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
63#define HV_X64_POST_MESSAGES (1 << 4)
64#define HV_X64_SIGNAL_EVENTS (1 << 5)
65#define HV_X64_CREATE_PORT (1 << 6)
66#define HV_X64_CONNECT_PORT (1 << 7)
67#define HV_X64_ACCESS_STATS (1 << 8)
68#define HV_X64_DEBUGGING (1 << 11)
69#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
70#define HV_X64_CONFIGURE_PROFILER (1 << 13)
71
72/*
73 * Feature identification. EDX indicates which miscellaneous features
74 * are available to the partition.
75 */
76/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
77#define HV_X64_MWAIT_AVAILABLE (1 << 0)
78/* Guest debugging support is available */
79#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
80/* Performance Monitor support is available*/
81#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
82/* Support for physical CPU dynamic partitioning events is available*/
83#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
84/*
85 * Support for passing hypercall input parameter block via XMM
86 * registers is available
87 */
88#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
89/* Support for a virtual guest idle state is available */
90#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
91
92/*
93 * Implementation recommendations. Indicates which behaviors the hypervisor
94 * recommends the OS implement for optimal performance.
95 */
96 /*
97 * Recommend using hypercall for address space switches rather
98 * than MOV to CR3 instruction
99 */
100#define HV_X64_MWAIT_RECOMMENDED (1 << 0)
101/* Recommend using hypercall for local TLB flushes rather
102 * than INVLPG or MOV to CR3 instructions */
103#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
104/*
105 * Recommend using hypercall for remote TLB flushes rather
106 * than inter-processor interrupts
107 */
108#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
109/*
110 * Recommend using MSRs for accessing APIC registers
111 * EOI, ICR and TPR rather than their memory-mapped counterparts
112 */
113#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
114/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
115#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
116/*
117 * Recommend using relaxed timing for this partition. If used,
118 * the VM should disable any watchdog timeouts that rely on the
119 * timely delivery of external interrupts
120 */
121#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
122
123/* MSR used to identify the guest OS. */
124#define HV_X64_MSR_GUEST_OS_ID 0x40000000
125
126/* MSR used to setup pages used to communicate with the hypervisor. */
127#define HV_X64_MSR_HYPERCALL 0x40000001
128
129/* MSR used to provide vcpu index */
130#define HV_X64_MSR_VP_INDEX 0x40000002
131
132/* Define the virtual APIC registers */
133#define HV_X64_MSR_EOI 0x40000070
134#define HV_X64_MSR_ICR 0x40000071
135#define HV_X64_MSR_TPR 0x40000072
136#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
137
138/* Define synthetic interrupt controller model specific registers. */
139#define HV_X64_MSR_SCONTROL 0x40000080
140#define HV_X64_MSR_SVERSION 0x40000081
141#define HV_X64_MSR_SIEFP 0x40000082
142#define HV_X64_MSR_SIMP 0x40000083
143#define HV_X64_MSR_EOM 0x40000084
144#define HV_X64_MSR_SINT0 0x40000090
145#define HV_X64_MSR_SINT1 0x40000091
146#define HV_X64_MSR_SINT2 0x40000092
147#define HV_X64_MSR_SINT3 0x40000093
148#define HV_X64_MSR_SINT4 0x40000094
149#define HV_X64_MSR_SINT5 0x40000095
150#define HV_X64_MSR_SINT6 0x40000096
151#define HV_X64_MSR_SINT7 0x40000097
152#define HV_X64_MSR_SINT8 0x40000098
153#define HV_X64_MSR_SINT9 0x40000099
154#define HV_X64_MSR_SINT10 0x4000009A
155#define HV_X64_MSR_SINT11 0x4000009B
156#define HV_X64_MSR_SINT12 0x4000009C
157#define HV_X64_MSR_SINT13 0x4000009D
158#define HV_X64_MSR_SINT14 0x4000009E
159#define HV_X64_MSR_SINT15 0x4000009F
160
161
162#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
163#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
164#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
165 (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
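A hedged sketch of how a guest would program the hypercall page with the constants just defined; wrmsrl() is the kernel's standard MSR write helper, and enable_hypercall_page()/hypercall_gpa are illustrative names.

#include <asm/msr.h>

/*
 * The guest-physical page address goes in the bits above
 * HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; bit 0 enables the page.
 */
static void enable_hypercall_page(u64 hypercall_gpa)
{
        u64 val = (hypercall_gpa & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK) |
                  HV_X64_MSR_HYPERCALL_ENABLE;

        wrmsrl(HV_X64_MSR_HYPERCALL, val);
}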
166
167/* Declare the various hypercall operations. */
168#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
169
170#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
171#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
172#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
173 (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
174
175#define HV_PROCESSOR_POWER_STATE_C0 0
176#define HV_PROCESSOR_POWER_STATE_C1 1
177#define HV_PROCESSOR_POWER_STATE_C2 2
178#define HV_PROCESSOR_POWER_STATE_C3 3
179
180/* hypercall status code */
181#define HV_STATUS_SUCCESS 0
182#define HV_STATUS_INVALID_HYPERCALL_CODE 2
183#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
184#define HV_STATUS_INVALID_ALIGNMENT 4
185
186#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index ebfb8a9e11f7..da2930924501 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -33,8 +33,16 @@ extern void init_thread_xstate(void);
33extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); 33extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
34 34
35extern user_regset_active_fn fpregs_active, xfpregs_active; 35extern user_regset_active_fn fpregs_active, xfpregs_active;
36extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get; 36extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
37extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set; 37 xstateregs_get;
38extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
39 xstateregs_set;
40
41/*
42 * xstateregs_active == fpregs_active. Please refer to the comment
43 * at the definition of fpregs_active.
44 */
45#define xstateregs_active fpregs_active
38 46
39extern struct _fpx_sw_bytes fx_sw_reserved; 47extern struct _fpx_sw_bytes fx_sw_reserved;
40#ifdef CONFIG_IA32_EMULATION 48#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 73739322b6d0..a1dcfa3ab17d 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -1,8 +1,42 @@
1#ifndef _ASM_X86_IO_H 1#ifndef _ASM_X86_IO_H
2#define _ASM_X86_IO_H 2#define _ASM_X86_IO_H
3 3
4/*
5 * This file contains the definitions for the x86 IO instructions
6 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
7 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
8 * versions of the single-IO instructions (inb_p/inw_p/..).
9 *
10 * This file is not meant to be obfuscating: it's just complicated
11 * to (a) handle it all in a way that makes gcc able to optimize it
12 * as well as possible and (b) trying to avoid writing the same thing
13 * over and over again with slight variations and possibly making a
14 * mistake somewhere.
15 */
16
17/*
18 * Thanks to James van Artsdalen for a better timing-fix than
19 * the two short jumps: using outb's to a nonexistent port seems
20 * to guarantee better timings even on fast machines.
21 *
22 * On the other hand, I'd like to be sure of a non-existent port:
23 * I feel a bit unsafe about using 0x80 (should be safe, though)
24 *
25 * Linus
26 */
27
28 /*
29 * Bit simplified and optimized by Jan Hubicka
30 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
31 *
32 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
33 * isa_read[wl] and isa_write[wl] fixed
34 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
35 */
36
4#define ARCH_HAS_IOREMAP_WC 37#define ARCH_HAS_IOREMAP_WC
5 38
39#include <linux/string.h>
6#include <linux/compiler.h> 40#include <linux/compiler.h>
7#include <asm-generic/int-ll64.h> 41#include <asm-generic/int-ll64.h>
8#include <asm/page.h> 42#include <asm/page.h>
@@ -173,11 +207,126 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
173extern void iounmap(volatile void __iomem *addr); 207extern void iounmap(volatile void __iomem *addr);
174 208
175 209
176#ifdef CONFIG_X86_32 210#ifdef __KERNEL__
177# include "io_32.h" 211
212#include <asm-generic/iomap.h>
213
214#include <linux/vmalloc.h>
215
216/*
217 * Convert a virtual cached pointer to an uncached pointer
218 */
219#define xlate_dev_kmem_ptr(p) p
220
221static inline void
222memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
223{
224 memset((void __force *)addr, val, count);
225}
226
227static inline void
228memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
229{
230 memcpy(dst, (const void __force *)src, count);
231}
232
233static inline void
234memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
235{
236 memcpy((void __force *)dst, src, count);
237}
238
239/*
240 * ISA space is 'always mapped' on a typical x86 system, no need to
241 * explicitly ioremap() it. The fact that the ISA IO space is mapped
242 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
243 * are physical addresses. The following constant pointer can be
244 * used as the IO-area pointer (it can be iounmapped as well, so the
245 * analogy with PCI is quite large):
246 */
247#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
248
249/*
250 * Cache management
251 *
252 * This needed for two cases
253 * 1. Out of order aware processors
254 * 2. Accidentally out of order processors (PPro errata #51)
255 */
256
257static inline void flush_write_buffers(void)
258{
259#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
260 asm volatile("lock; addl $0,0(%%esp)": : :"memory");
261#endif
262}
263
264#endif /* __KERNEL__ */
265
266extern void native_io_delay(void);
267
268extern int io_delay_type;
269extern void io_delay_init(void);
270
271#if defined(CONFIG_PARAVIRT)
272#include <asm/paravirt.h>
178#else 273#else
179# include "io_64.h" 274
275static inline void slow_down_io(void)
276{
277 native_io_delay();
278#ifdef REALLY_SLOW_IO
279 native_io_delay();
280 native_io_delay();
281 native_io_delay();
180#endif 282#endif
283}
284
285#endif
286
287#define BUILDIO(bwl, bw, type) \
288static inline void out##bwl(unsigned type value, int port) \
289{ \
290 asm volatile("out" #bwl " %" #bw "0, %w1" \
291 : : "a"(value), "Nd"(port)); \
292} \
293 \
294static inline unsigned type in##bwl(int port) \
295{ \
296 unsigned type value; \
297 asm volatile("in" #bwl " %w1, %" #bw "0" \
298 : "=a"(value) : "Nd"(port)); \
299 return value; \
300} \
301 \
302static inline void out##bwl##_p(unsigned type value, int port) \
303{ \
304 out##bwl(value, port); \
305 slow_down_io(); \
306} \
307 \
308static inline unsigned type in##bwl##_p(int port) \
309{ \
310 unsigned type value = in##bwl(port); \
311 slow_down_io(); \
312 return value; \
313} \
314 \
315static inline void outs##bwl(int port, const void *addr, unsigned long count) \
316{ \
317 asm volatile("rep; outs" #bwl \
318 : "+S"(addr), "+c"(count) : "d"(port)); \
319} \
320 \
321static inline void ins##bwl(int port, void *addr, unsigned long count) \
322{ \
323 asm volatile("rep; ins" #bwl \
324 : "+D"(addr), "+c"(count) : "d"(port)); \
325}
326
327BUILDIO(b, b, char)
328BUILDIO(w, w, short)
329BUILDIO(l, , int)
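For reference, a small usage sketch of the accessors that BUILDIO expands to (outb/inb, the _p variants, and the string forms); post_code(), read_status_slow() and the port constants are illustrative.

/* Write a diagnostic code to the traditional POST port. */
static void post_code(unsigned char code)
{
        outb(code, 0x80);               /* generated by BUILDIO(b, b, char) */
}

/* Read a status byte with the pausing variant. */
static unsigned char read_status_slow(int port)
{
        return inb_p(port);             /* inb() followed by slow_down_io() */
}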
181 330
182extern void *xlate_dev_mem_ptr(unsigned long phys); 331extern void *xlate_dev_mem_ptr(unsigned long phys);
183extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); 332extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
deleted file mode 100644
index a299900f5920..000000000000
--- a/arch/x86/include/asm/io_32.h
+++ /dev/null
@@ -1,196 +0,0 @@
1#ifndef _ASM_X86_IO_32_H
2#define _ASM_X86_IO_32_H
3
4#include <linux/string.h>
5#include <linux/compiler.h>
6
7/*
8 * This file contains the definitions for the x86 IO instructions
9 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
10 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
11 * versions of the single-IO instructions (inb_p/inw_p/..).
12 *
13 * This file is not meant to be obfuscating: it's just complicated
14 * to (a) handle it all in a way that makes gcc able to optimize it
15 * as well as possible and (b) trying to avoid writing the same thing
16 * over and over again with slight variations and possibly making a
17 * mistake somewhere.
18 */
19
20/*
21 * Thanks to James van Artsdalen for a better timing-fix than
22 * the two short jumps: using outb's to a nonexistent port seems
23 * to guarantee better timings even on fast machines.
24 *
25 * On the other hand, I'd like to be sure of a non-existent port:
26 * I feel a bit unsafe about using 0x80 (should be safe, though)
27 *
28 * Linus
29 */
30
31 /*
32 * Bit simplified and optimized by Jan Hubicka
33 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
34 *
35 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
36 * isa_read[wl] and isa_write[wl] fixed
37 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
38 */
39
40#define XQUAD_PORTIO_BASE 0xfe400000
41#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
42
43#ifdef __KERNEL__
44
45#include <asm-generic/iomap.h>
46
47#include <linux/vmalloc.h>
48
49/*
50 * Convert a virtual cached pointer to an uncached pointer
51 */
52#define xlate_dev_kmem_ptr(p) p
53
54static inline void
55memset_io(volatile void __iomem *addr, unsigned char val, int count)
56{
57 memset((void __force *)addr, val, count);
58}
59
60static inline void
61memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
62{
63 __memcpy(dst, (const void __force *)src, count);
64}
65
66static inline void
67memcpy_toio(volatile void __iomem *dst, const void *src, int count)
68{
69 __memcpy((void __force *)dst, src, count);
70}
71
72/*
73 * ISA space is 'always mapped' on a typical x86 system, no need to
74 * explicitly ioremap() it. The fact that the ISA IO space is mapped
75 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
76 * are physical addresses. The following constant pointer can be
77 * used as the IO-area pointer (it can be iounmapped as well, so the
78 * analogy with PCI is quite large):
79 */
80#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
81
82/*
83 * Cache management
84 *
85 * This needed for two cases
86 * 1. Out of order aware processors
87 * 2. Accidentally out of order processors (PPro errata #51)
88 */
89
90#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
91
92static inline void flush_write_buffers(void)
93{
94 asm volatile("lock; addl $0,0(%%esp)": : :"memory");
95}
96
97#else
98
99#define flush_write_buffers() do { } while (0)
100
101#endif
102
103#endif /* __KERNEL__ */
104
105extern void native_io_delay(void);
106
107extern int io_delay_type;
108extern void io_delay_init(void);
109
110#if defined(CONFIG_PARAVIRT)
111#include <asm/paravirt.h>
112#else
113
114static inline void slow_down_io(void)
115{
116 native_io_delay();
117#ifdef REALLY_SLOW_IO
118 native_io_delay();
119 native_io_delay();
120 native_io_delay();
121#endif
122}
123
124#endif
125
126#define __BUILDIO(bwl, bw, type) \
127static inline void out##bwl(unsigned type value, int port) \
128{ \
129 out##bwl##_local(value, port); \
130} \
131 \
132static inline unsigned type in##bwl(int port) \
133{ \
134 return in##bwl##_local(port); \
135}
136
137#define BUILDIO(bwl, bw, type) \
138static inline void out##bwl##_local(unsigned type value, int port) \
139{ \
140 asm volatile("out" #bwl " %" #bw "0, %w1" \
141 : : "a"(value), "Nd"(port)); \
142} \
143 \
144static inline unsigned type in##bwl##_local(int port) \
145{ \
146 unsigned type value; \
147 asm volatile("in" #bwl " %w1, %" #bw "0" \
148 : "=a"(value) : "Nd"(port)); \
149 return value; \
150} \
151 \
152static inline void out##bwl##_local_p(unsigned type value, int port) \
153{ \
154 out##bwl##_local(value, port); \
155 slow_down_io(); \
156} \
157 \
158static inline unsigned type in##bwl##_local_p(int port) \
159{ \
160 unsigned type value = in##bwl##_local(port); \
161 slow_down_io(); \
162 return value; \
163} \
164 \
165__BUILDIO(bwl, bw, type) \
166 \
167static inline void out##bwl##_p(unsigned type value, int port) \
168{ \
169 out##bwl(value, port); \
170 slow_down_io(); \
171} \
172 \
173static inline unsigned type in##bwl##_p(int port) \
174{ \
175 unsigned type value = in##bwl(port); \
176 slow_down_io(); \
177 return value; \
178} \
179 \
180static inline void outs##bwl(int port, const void *addr, unsigned long count) \
181{ \
182 asm volatile("rep; outs" #bwl \
183 : "+S"(addr), "+c"(count) : "d"(port)); \
184} \
185 \
186static inline void ins##bwl(int port, void *addr, unsigned long count) \
187{ \
188 asm volatile("rep; ins" #bwl \
189 : "+D"(addr), "+c"(count) : "d"(port)); \
190}
191
192BUILDIO(b, b, char)
193BUILDIO(w, w, short)
194BUILDIO(l, , int)
195
196#endif /* _ASM_X86_IO_32_H */
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
deleted file mode 100644
index 244067893af4..000000000000
--- a/arch/x86/include/asm/io_64.h
+++ /dev/null
@@ -1,181 +0,0 @@
1#ifndef _ASM_X86_IO_64_H
2#define _ASM_X86_IO_64_H
3
4
5/*
6 * This file contains the definitions for the x86 IO instructions
7 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
8 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
9 * versions of the single-IO instructions (inb_p/inw_p/..).
10 *
11 * This file is not meant to be obfuscating: it's just complicated
12 * to (a) handle it all in a way that makes gcc able to optimize it
13 * as well as possible and (b) trying to avoid writing the same thing
14 * over and over again with slight variations and possibly making a
15 * mistake somewhere.
16 */
17
18/*
19 * Thanks to James van Artsdalen for a better timing-fix than
20 * the two short jumps: using outb's to a nonexistent port seems
21 * to guarantee better timings even on fast machines.
22 *
23 * On the other hand, I'd like to be sure of a non-existent port:
24 * I feel a bit unsafe about using 0x80 (should be safe, though)
25 *
26 * Linus
27 */
28
29 /*
30 * Bit simplified and optimized by Jan Hubicka
31 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
32 *
33 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
34 * isa_read[wl] and isa_write[wl] fixed
35 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
36 */
37
38extern void native_io_delay(void);
39
40extern int io_delay_type;
41extern void io_delay_init(void);
42
43#if defined(CONFIG_PARAVIRT)
44#include <asm/paravirt.h>
45#else
46
47static inline void slow_down_io(void)
48{
49 native_io_delay();
50#ifdef REALLY_SLOW_IO
51 native_io_delay();
52 native_io_delay();
53 native_io_delay();
54#endif
55}
56#endif
57
58/*
59 * Talk about misusing macros..
60 */
61#define __OUT1(s, x) \
62static inline void out##s(unsigned x value, unsigned short port) {
63
64#define __OUT2(s, s1, s2) \
65asm volatile ("out" #s " %" s1 "0,%" s2 "1"
66
67#ifndef REALLY_SLOW_IO
68#define REALLY_SLOW_IO
69#define UNSET_REALLY_SLOW_IO
70#endif
71
72#define __OUT(s, s1, x) \
73 __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
74 } \
75 __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
76 slow_down_io(); \
77}
78
79#define __IN1(s) \
80static inline RETURN_TYPE in##s(unsigned short port) \
81{ \
82 RETURN_TYPE _v;
83
84#define __IN2(s, s1, s2) \
85 asm volatile ("in" #s " %" s2 "1,%" s1 "0"
86
87#define __IN(s, s1, i...) \
88 __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
89 return _v; \
90 } \
91 __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
92 slow_down_io(); \
93 return _v; }
94
95#ifdef UNSET_REALLY_SLOW_IO
96#undef REALLY_SLOW_IO
97#endif
98
99#define __INS(s) \
100static inline void ins##s(unsigned short port, void *addr, \
101 unsigned long count) \
102{ \
103 asm volatile ("rep ; ins" #s \
104 : "=D" (addr), "=c" (count) \
105 : "d" (port), "0" (addr), "1" (count)); \
106}
107
108#define __OUTS(s) \
109static inline void outs##s(unsigned short port, const void *addr, \
110 unsigned long count) \
111{ \
112 asm volatile ("rep ; outs" #s \
113 : "=S" (addr), "=c" (count) \
114 : "d" (port), "0" (addr), "1" (count)); \
115}
116
117#define RETURN_TYPE unsigned char
118__IN(b, "")
119#undef RETURN_TYPE
120#define RETURN_TYPE unsigned short
121__IN(w, "")
122#undef RETURN_TYPE
123#define RETURN_TYPE unsigned int
124__IN(l, "")
125#undef RETURN_TYPE
126
127__OUT(b, "b", char)
128__OUT(w, "w", short)
129__OUT(l, , int)
130
131__INS(b)
132__INS(w)
133__INS(l)
134
135__OUTS(b)
136__OUTS(w)
137__OUTS(l)
138
139#if defined(__KERNEL__) && defined(__x86_64__)
140
141#include <linux/vmalloc.h>
142
143#include <asm-generic/iomap.h>
144
145void __memcpy_fromio(void *, unsigned long, unsigned);
146void __memcpy_toio(unsigned long, const void *, unsigned);
147
148static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
149 unsigned len)
150{
151 __memcpy_fromio(to, (unsigned long)from, len);
152}
153
154static inline void memcpy_toio(volatile void __iomem *to, const void *from,
155 unsigned len)
156{
157 __memcpy_toio((unsigned long)to, from, len);
158}
159
160void memset_io(volatile void __iomem *a, int b, size_t c);
161
162/*
163 * ISA space is 'always mapped' on a typical x86 system, no need to
164 * explicitly ioremap() it. The fact that the ISA IO space is mapped
165 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
166 * are physical addresses. The following constant pointer can be
167 * used as the IO-area pointer (it can be iounmapped as well, so the
168 * analogy with PCI is quite large):
169 */
170#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
171
172#define flush_write_buffers()
173
174/*
175 * Convert a virtual cached pointer to an uncached pointer
176 */
177#define xlate_dev_kmem_ptr(p) p
178
179#endif /* __KERNEL__ */
180
181#endif /* _ASM_X86_IO_64_H */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4fe681de1e76..4ffa345a8ccb 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -32,7 +32,10 @@ struct kprobe;
32 32
33typedef u8 kprobe_opcode_t; 33typedef u8 kprobe_opcode_t;
34#define BREAKPOINT_INSTRUCTION 0xcc 34#define BREAKPOINT_INSTRUCTION 0xcc
35#define RELATIVEJUMP_INSTRUCTION 0xe9 35#define RELATIVEJUMP_OPCODE 0xe9
36#define RELATIVEJUMP_SIZE 5
37#define RELATIVECALL_OPCODE 0xe8
38#define RELATIVE_ADDR_SIZE 4
36#define MAX_INSN_SIZE 16 39#define MAX_INSN_SIZE 16
37#define MAX_STACK_SIZE 64 40#define MAX_STACK_SIZE 64
38#define MIN_STACK_SIZE(ADDR) \ 41#define MIN_STACK_SIZE(ADDR) \
@@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t;
44 47
45#define flush_insn_slot(p) do { } while (0) 48#define flush_insn_slot(p) do { } while (0)
46 49
50/* optinsn template addresses */
51extern kprobe_opcode_t optprobe_template_entry;
52extern kprobe_opcode_t optprobe_template_val;
53extern kprobe_opcode_t optprobe_template_call;
54extern kprobe_opcode_t optprobe_template_end;
55#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
56#define MAX_OPTINSN_SIZE \
57 (((unsigned long)&optprobe_template_end - \
58 (unsigned long)&optprobe_template_entry) + \
59 MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
60
47extern const int kretprobe_blacklist_size; 61extern const int kretprobe_blacklist_size;
48 62
49void arch_remove_kprobe(struct kprobe *p); 63void arch_remove_kprobe(struct kprobe *p);
@@ -64,6 +78,21 @@ struct arch_specific_insn {
64 int boostable; 78 int boostable;
65}; 79};
66 80
81struct arch_optimized_insn {
82 /* copy of the original instructions */
83 kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
84 /* detour code buffer */
85 kprobe_opcode_t *insn;
86 /* the size of instructions copied to detour code buffer */
87 size_t size;
88};
89
90/* Return true (!0) if optinsn is prepared for optimization. */
91static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
92{
93 return optinsn->size;
94}
95
67struct prev_kprobe { 96struct prev_kprobe {
68 struct kprobe *kp; 97 struct kprobe *kp;
69 unsigned long status; 98 unsigned long status;
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7c18e1230f54..7a6f54fa13ba 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -54,13 +54,23 @@ struct x86_emulate_ctxt;
54struct x86_emulate_ops { 54struct x86_emulate_ops {
55 /* 55 /*
56 * read_std: Read bytes of standard (non-emulated/special) memory. 56 * read_std: Read bytes of standard (non-emulated/special) memory.
57 * Used for instruction fetch, stack operations, and others. 57 * Used for descriptor reading.
58 * @addr: [IN ] Linear address from which to read. 58 * @addr: [IN ] Linear address from which to read.
59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
60 * @bytes: [IN ] Number of bytes to read from memory. 60 * @bytes: [IN ] Number of bytes to read from memory.
61 */ 61 */
62 int (*read_std)(unsigned long addr, void *val, 62 int (*read_std)(unsigned long addr, void *val,
63 unsigned int bytes, struct kvm_vcpu *vcpu); 63 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
64
65 /*
66 * fetch: Read bytes of standard (non-emulated/special) memory.
67 * Used for instruction fetch.
68 * @addr: [IN ] Linear address from which to read.
69 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
70 * @bytes: [IN ] Number of bytes to read from memory.
71 */
72 int (*fetch)(unsigned long addr, void *val,
73 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
64 74
65 /* 75 /*
66 * read_emulated: Read bytes from emulated/special memory area. 76 * read_emulated: Read bytes from emulated/special memory area.
@@ -74,7 +84,7 @@ struct x86_emulate_ops {
74 struct kvm_vcpu *vcpu); 84 struct kvm_vcpu *vcpu);
75 85
76 /* 86 /*
77 * write_emulated: Read bytes from emulated/special memory area. 87 * write_emulated: Write bytes to emulated/special memory area.
78 * @addr: [IN ] Linear address to which to write. 88 * @addr: [IN ] Linear address to which to write.
79 * @val: [IN ] Value to write to memory (low-order bytes used as 89 * @val: [IN ] Value to write to memory (low-order bytes used as
80 * required). 90 * required).
@@ -168,6 +178,7 @@ struct x86_emulate_ctxt {
168 178
169/* Execution mode, passed to the emulator. */ 179/* Execution mode, passed to the emulator. */
170#define X86EMUL_MODE_REAL 0 /* Real mode. */ 180#define X86EMUL_MODE_REAL 0 /* Real mode. */
181#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
171#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ 182#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
172#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ 183#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
173#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ 184#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4f865e8b8540..06d9e79ca37d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,7 +25,7 @@
25#include <asm/mtrr.h> 25#include <asm/mtrr.h>
26#include <asm/msr-index.h> 26#include <asm/msr-index.h>
27 27
28#define KVM_MAX_VCPUS 16 28#define KVM_MAX_VCPUS 64
29#define KVM_MEMORY_SLOTS 32 29#define KVM_MEMORY_SLOTS 32
30/* memory slots that are not exposed to userspace */ 30/* memory slots that are not exposed to userspace */
31#define KVM_PRIVATE_MEM_SLOTS 4 31#define KVM_PRIVATE_MEM_SLOTS 4
@@ -38,19 +38,6 @@
38#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 38#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
39 0xFFFFFF0000000000ULL) 39 0xFFFFFF0000000000ULL)
40 40
41#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
42 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
43#define KVM_GUEST_CR0_MASK \
44 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
45#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
46 (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
47#define KVM_VM_CR0_ALWAYS_ON \
48 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
49#define KVM_GUEST_CR4_MASK \
50 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
51#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
52#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
53
54#define INVALID_PAGE (~(hpa_t)0) 41#define INVALID_PAGE (~(hpa_t)0)
55#define UNMAPPED_GVA (~(gpa_t)0) 42#define UNMAPPED_GVA (~(gpa_t)0)
56 43
@@ -256,7 +243,8 @@ struct kvm_mmu {
256 void (*new_cr3)(struct kvm_vcpu *vcpu); 243 void (*new_cr3)(struct kvm_vcpu *vcpu);
257 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 244 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
258 void (*free)(struct kvm_vcpu *vcpu); 245 void (*free)(struct kvm_vcpu *vcpu);
259 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); 246 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
247 u32 *error);
260 void (*prefetch_page)(struct kvm_vcpu *vcpu, 248 void (*prefetch_page)(struct kvm_vcpu *vcpu,
261 struct kvm_mmu_page *page); 249 struct kvm_mmu_page *page);
262 int (*sync_page)(struct kvm_vcpu *vcpu, 250 int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -282,13 +270,15 @@ struct kvm_vcpu_arch {
282 u32 regs_dirty; 270 u32 regs_dirty;
283 271
284 unsigned long cr0; 272 unsigned long cr0;
273 unsigned long cr0_guest_owned_bits;
285 unsigned long cr2; 274 unsigned long cr2;
286 unsigned long cr3; 275 unsigned long cr3;
287 unsigned long cr4; 276 unsigned long cr4;
277 unsigned long cr4_guest_owned_bits;
288 unsigned long cr8; 278 unsigned long cr8;
289 u32 hflags; 279 u32 hflags;
290 u64 pdptrs[4]; /* pae */ 280 u64 pdptrs[4]; /* pae */
291 u64 shadow_efer; 281 u64 efer;
292 u64 apic_base; 282 u64 apic_base;
293 struct kvm_lapic *apic; /* kernel irqchip context */ 283 struct kvm_lapic *apic; /* kernel irqchip context */
294 int32_t apic_arb_prio; 284 int32_t apic_arb_prio;
@@ -374,17 +364,27 @@ struct kvm_vcpu_arch {
374 /* used for guest single stepping over the given code position */ 364 /* used for guest single stepping over the given code position */
375 u16 singlestep_cs; 365 u16 singlestep_cs;
376 unsigned long singlestep_rip; 366 unsigned long singlestep_rip;
367 /* fields used by HYPER-V emulation */
368 u64 hv_vapic;
377}; 369};
378 370
379struct kvm_mem_alias { 371struct kvm_mem_alias {
380 gfn_t base_gfn; 372 gfn_t base_gfn;
381 unsigned long npages; 373 unsigned long npages;
382 gfn_t target_gfn; 374 gfn_t target_gfn;
375#define KVM_ALIAS_INVALID 1UL
376 unsigned long flags;
383}; 377};
384 378
385struct kvm_arch{ 379#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
386 int naliases; 380
381struct kvm_mem_aliases {
387 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; 382 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
383 int naliases;
384};
385
386struct kvm_arch {
387 struct kvm_mem_aliases *aliases;
388 388
389 unsigned int n_free_mmu_pages; 389 unsigned int n_free_mmu_pages;
390 unsigned int n_requested_mmu_pages; 390 unsigned int n_requested_mmu_pages;
@@ -416,6 +416,10 @@ struct kvm_arch{
416 s64 kvmclock_offset; 416 s64 kvmclock_offset;
417 417
418 struct kvm_xen_hvm_config xen_hvm_config; 418 struct kvm_xen_hvm_config xen_hvm_config;
419
420 /* fields used by HYPER-V emulation */
421 u64 hv_guest_os_id;
422 u64 hv_hypercall;
419}; 423};
420 424
421struct kvm_vm_stat { 425struct kvm_vm_stat {
@@ -471,6 +475,7 @@ struct kvm_x86_ops {
471 int (*hardware_setup)(void); /* __init */ 475 int (*hardware_setup)(void); /* __init */
472 void (*hardware_unsetup)(void); /* __exit */ 476 void (*hardware_unsetup)(void); /* __exit */
473 bool (*cpu_has_accelerated_tpr)(void); 477 bool (*cpu_has_accelerated_tpr)(void);
478 void (*cpuid_update)(struct kvm_vcpu *vcpu);
474 479
475 /* Create, but do not attach this VCPU */ 480 /* Create, but do not attach this VCPU */
476 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); 481 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
@@ -492,6 +497,7 @@ struct kvm_x86_ops {
492 void (*set_segment)(struct kvm_vcpu *vcpu, 497 void (*set_segment)(struct kvm_vcpu *vcpu,
493 struct kvm_segment *var, int seg); 498 struct kvm_segment *var, int seg);
494 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); 499 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
500 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
495 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); 501 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
496 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 502 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
497 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 503 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -501,12 +507,13 @@ struct kvm_x86_ops {
501 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 507 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
502 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 508 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
503 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 509 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
504 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); 510 int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
505 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, 511 int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
506 int *exception);
507 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); 512 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
508 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 513 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
509 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 514 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
515 void (*fpu_activate)(struct kvm_vcpu *vcpu);
516 void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
510 517
511 void (*tlb_flush)(struct kvm_vcpu *vcpu); 518 void (*tlb_flush)(struct kvm_vcpu *vcpu);
512 519
@@ -531,7 +538,8 @@ struct kvm_x86_ops {
531 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 538 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
532 int (*get_tdp_level)(void); 539 int (*get_tdp_level)(void);
533 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 540 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
534 bool (*gb_page_enable)(void); 541 int (*get_lpage_level)(void);
542 bool (*rdtscp_supported)(void);
535 543
536 const struct trace_print_flags *exit_reasons_str; 544 const struct trace_print_flags *exit_reasons_str;
537}; 545};
@@ -606,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
606 unsigned long value); 614 unsigned long value);
607 615
608void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 616void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
609int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 617int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
610 int type_bits, int seg);
611 618
612int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); 619int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
613 620
@@ -653,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
653int kvm_mmu_load(struct kvm_vcpu *vcpu); 660int kvm_mmu_load(struct kvm_vcpu *vcpu);
654void kvm_mmu_unload(struct kvm_vcpu *vcpu); 661void kvm_mmu_unload(struct kvm_vcpu *vcpu);
655void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 662void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
663gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
664gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
665gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
666gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
656 667
657int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 668int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
658 669
@@ -666,6 +677,7 @@ void kvm_disable_tdp(void);
666 677
667int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); 678int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
668int complete_pio(struct kvm_vcpu *vcpu); 679int complete_pio(struct kvm_vcpu *vcpu);
680bool kvm_check_iopl(struct kvm_vcpu *vcpu);
669 681
670struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); 682struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
671 683
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c584076a47f4..ffae1420e7d7 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_KVM_PARA_H 2#define _ASM_X86_KVM_PARA_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/hyperv.h>
5 6
6/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It 7/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
7 * should be used to determine that a VM is running under KVM. 8 * should be used to determine that a VM is running under KVM.
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 47b9b6f19057..2e9972468a5d 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -195,41 +195,4 @@ static inline long local_sub_return(long i, local_t *l)
195#define __local_add(i, l) local_add((i), (l)) 195#define __local_add(i, l) local_add((i), (l))
196#define __local_sub(i, l) local_sub((i), (l)) 196#define __local_sub(i, l) local_sub((i), (l))
197 197
198/* Use these for per-cpu local_t variables: on some archs they are
199 * much more efficient than these naive implementations. Note they take
200 * a variable, not an address.
201 *
202 * X86_64: This could be done better if we moved the per cpu data directly
203 * after GS.
204 */
205
206/* Need to disable preemption for the cpu local counters otherwise we could
207 still access a variable of a previous CPU in a non atomic way. */
208#define cpu_local_wrap_v(l) \
209({ \
210 local_t res__; \
211 preempt_disable(); \
212 res__ = (l); \
213 preempt_enable(); \
214 res__; \
215})
216#define cpu_local_wrap(l) \
217({ \
218 preempt_disable(); \
219 (l); \
220 preempt_enable(); \
221}) \
222
223#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
224#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
225#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
226#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
227#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
228#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
229
230#define __cpu_local_inc(l) cpu_local_inc((l))
231#define __cpu_local_dec(l) cpu_local_dec((l))
232#define __cpu_local_add(i, l) cpu_local_add((i), (l))
233#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
234
235#endif /* _ASM_X86_LOCAL_H */ 198#endif /* _ASM_X86_LOCAL_H */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index a29f48c2a322..288b96f815a6 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ 40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
41 NODE_DATA(nid)->node_spanned_pages) 41 NODE_DATA(nid)->node_spanned_pages)
42
43#ifdef CONFIG_NUMA_EMU
44#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024)
45#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
46#endif
47
48#endif 42#endif
49#endif /* _ASM_X86_MMZONE_64_H */ 43#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 139d4c1a33a7..93da9c3f3341 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -19,7 +19,6 @@ extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
19extern int check_nmi_watchdog(void); 19extern int check_nmi_watchdog(void);
20extern int nmi_watchdog_enabled; 20extern int nmi_watchdog_enabled;
21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
22extern int avail_to_resrv_perfctr_nmi(unsigned int);
23extern int reserve_perfctr_nmi(unsigned int); 22extern int reserve_perfctr_nmi(unsigned int);
24extern void release_perfctr_nmi(unsigned int); 23extern void release_perfctr_nmi(unsigned int);
25extern int reserve_evntsel_nmi(unsigned int); 24extern int reserve_evntsel_nmi(unsigned int);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index c4ae822e415f..823e070e7c26 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node);
36extern void __cpuinit numa_clear_node(int cpu); 36extern void __cpuinit numa_clear_node(int cpu);
37extern void __cpuinit numa_add_cpu(int cpu); 37extern void __cpuinit numa_add_cpu(int cpu);
38extern void __cpuinit numa_remove_cpu(int cpu); 38extern void __cpuinit numa_remove_cpu(int cpu);
39
40#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)64 << 20)
42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
43#endif /* CONFIG_NUMA_EMU */
39#else 44#else
40static inline void init_cpu_to_node(void) { } 45static inline void init_cpu_to_node(void) { }
41static inline void numa_set_node(int cpu, int node) { } 46static inline void numa_set_node(int cpu, int node) { }
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index ef6bf81f8460..37c516545ec8 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -34,6 +34,10 @@ extern int pci_numaq_init(void);
34 34
35extern void *xquad_portio; 35extern void *xquad_portio;
36 36
37#define XQUAD_PORTIO_BASE 0xfe400000
38#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
39#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
40
37/* 41/*
38 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the 42 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
39 */ 43 */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 642fe34b36a2..a667f24c7254 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,7 +40,6 @@
40 40
41#ifndef __ASSEMBLY__ 41#ifndef __ASSEMBLY__
42 42
43extern int page_is_ram(unsigned long pagenr);
44extern int devmem_is_allowed(unsigned long pagenr); 43extern int devmem_is_allowed(unsigned long pagenr);
45 44
46extern unsigned long max_low_pfn_mapped; 45extern unsigned long max_low_pfn_mapped;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index dd59a85a918f..5653f43d90e5 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,15 +435,6 @@ static inline void paravirt_release_pud(unsigned long pfn)
435 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); 435 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
436} 436}
437 437
438#ifdef CONFIG_HIGHPTE
439static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
440{
441 unsigned long ret;
442 ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
443 return (void *)ret;
444}
445#endif
446
447static inline void pte_update(struct mm_struct *mm, unsigned long addr, 438static inline void pte_update(struct mm_struct *mm, unsigned long addr,
448 pte_t *ptep) 439 pte_t *ptep)
449{ 440{
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b1e70d51e40c..db9ef5532341 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -304,10 +304,6 @@ struct pv_mmu_ops {
304#endif /* PAGETABLE_LEVELS == 4 */ 304#endif /* PAGETABLE_LEVELS == 4 */
305#endif /* PAGETABLE_LEVELS >= 3 */ 305#endif /* PAGETABLE_LEVELS >= 3 */
306 306
307#ifdef CONFIG_HIGHPTE
308 void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
309#endif
310
311 struct pv_lazy_ops lazy_mode; 307 struct pv_lazy_ops lazy_mode;
312 308
313 /* dom0 ops */ 309 /* dom0 ops */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 8bd433ccc242..3e002ca5a287 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -131,6 +131,8 @@ extern void pci_iommu_alloc(void);
131#include "pci_64.h" 131#include "pci_64.h"
132#endif 132#endif
133 133
134void dma32_reserve_bootmem(void);
135
134/* implement the pci_ DMA API in terms of the generic device dma_ one */ 136/* implement the pci_ DMA API in terms of the generic device dma_ one */
135#include <asm-generic/pci-dma-compat.h> 137#include <asm-generic/pci-dma-compat.h>
136 138
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index ae5e40f67daf..fe15cfb21b9b 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -22,8 +22,6 @@ extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
22extern int (*pci_config_write)(int seg, int bus, int dev, int fn, 22extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
23 int reg, int len, u32 value); 23 int reg, int len, u32 value);
24 24
25extern void dma32_reserve_bootmem(void);
26
27#endif /* __KERNEL__ */ 25#endif /* __KERNEL__ */
28 26
29#endif /* _ASM_X86_PCI_64_H */ 27#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 36085518badb..1a0422348d6d 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -29,6 +29,7 @@
29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000
30#define PCI_HAS_IO_ECS 0x40000 30#define PCI_HAS_IO_ECS 0x40000
31#define PCI_NOASSIGN_ROMS 0x80000 31#define PCI_NOASSIGN_ROMS 0x80000
32#define PCI_ROOT_NO_CRS 0x100000
32 33
33extern unsigned int pci_probe; 34extern unsigned int pci_probe;
34extern unsigned long pirq_table_addr; 35extern unsigned long pirq_table_addr;
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 0c44196b78ac..66a272dfd8b8 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -25,19 +25,18 @@
25 */ 25 */
26#ifdef CONFIG_SMP 26#ifdef CONFIG_SMP
27#define PER_CPU(var, reg) \ 27#define PER_CPU(var, reg) \
28 __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \ 28 __percpu_mov_op %__percpu_seg:this_cpu_off, reg; \
29 lea per_cpu__##var(reg), reg 29 lea var(reg), reg
30#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var 30#define PER_CPU_VAR(var) %__percpu_seg:var
31#else /* ! SMP */ 31#else /* ! SMP */
32#define PER_CPU(var, reg) \ 32#define PER_CPU(var, reg) __percpu_mov_op $var, reg
33 __percpu_mov_op $per_cpu__##var, reg 33#define PER_CPU_VAR(var) var
34#define PER_CPU_VAR(var) per_cpu__##var
35#endif /* SMP */ 34#endif /* SMP */
36 35
37#ifdef CONFIG_X86_64_SMP 36#ifdef CONFIG_X86_64_SMP
38#define INIT_PER_CPU_VAR(var) init_per_cpu__##var 37#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
39#else 38#else
40#define INIT_PER_CPU_VAR(var) per_cpu__##var 39#define INIT_PER_CPU_VAR(var) var
41#endif 40#endif
42 41
43#else /* ...!ASSEMBLY */ 42#else /* ...!ASSEMBLY */
@@ -60,12 +59,12 @@
60 * There also must be an entry in vmlinux_64.lds.S 59 * There also must be an entry in vmlinux_64.lds.S
61 */ 60 */
62#define DECLARE_INIT_PER_CPU(var) \ 61#define DECLARE_INIT_PER_CPU(var) \
63 extern typeof(per_cpu_var(var)) init_per_cpu_var(var) 62 extern typeof(var) init_per_cpu_var(var)
64 63
65#ifdef CONFIG_X86_64_SMP 64#ifdef CONFIG_X86_64_SMP
66#define init_per_cpu_var(var) init_per_cpu__##var 65#define init_per_cpu_var(var) init_per_cpu__##var
67#else 66#else
68#define init_per_cpu_var(var) per_cpu_var(var) 67#define init_per_cpu_var(var) var
69#endif 68#endif
70 69
71/* For arch-specific code, we can use direct single-insn ops (they 70/* For arch-specific code, we can use direct single-insn ops (they
@@ -104,6 +103,64 @@ do { \
104 } \ 103 } \
105} while (0) 104} while (0)
106 105
106/*
107 * Generate a percpu add to memory instruction and optimize code
108 * if 1 is added or subtracted.
109 */
110#define percpu_add_op(var, val) \
111do { \
112 typedef typeof(var) pao_T__; \
113 const int pao_ID__ = (__builtin_constant_p(val) && \
114 ((val) == 1 || (val) == -1)) ? (val) : 0; \
115 if (0) { \
116 pao_T__ pao_tmp__; \
117 pao_tmp__ = (val); \
118 } \
119 switch (sizeof(var)) { \
120 case 1: \
121 if (pao_ID__ == 1) \
122 asm("incb "__percpu_arg(0) : "+m" (var)); \
123 else if (pao_ID__ == -1) \
124 asm("decb "__percpu_arg(0) : "+m" (var)); \
125 else \
126 asm("addb %1, "__percpu_arg(0) \
127 : "+m" (var) \
128 : "qi" ((pao_T__)(val))); \
129 break; \
130 case 2: \
131 if (pao_ID__ == 1) \
132 asm("incw "__percpu_arg(0) : "+m" (var)); \
133 else if (pao_ID__ == -1) \
134 asm("decw "__percpu_arg(0) : "+m" (var)); \
135 else \
136 asm("addw %1, "__percpu_arg(0) \
137 : "+m" (var) \
138 : "ri" ((pao_T__)(val))); \
139 break; \
140 case 4: \
141 if (pao_ID__ == 1) \
142 asm("incl "__percpu_arg(0) : "+m" (var)); \
143 else if (pao_ID__ == -1) \
144 asm("decl "__percpu_arg(0) : "+m" (var)); \
145 else \
146 asm("addl %1, "__percpu_arg(0) \
147 : "+m" (var) \
148 : "ri" ((pao_T__)(val))); \
149 break; \
150 case 8: \
151 if (pao_ID__ == 1) \
152 asm("incq "__percpu_arg(0) : "+m" (var)); \
153 else if (pao_ID__ == -1) \
154 asm("decq "__percpu_arg(0) : "+m" (var)); \
155 else \
156 asm("addq %1, "__percpu_arg(0) \
157 : "+m" (var) \
158 : "re" ((pao_T__)(val))); \
159 break; \
160 default: __bad_percpu_size(); \
161 } \
162} while (0)
163
107#define percpu_from_op(op, var, constraint) \ 164#define percpu_from_op(op, var, constraint) \
108({ \ 165({ \
109 typeof(var) pfo_ret__; \ 166 typeof(var) pfo_ret__; \
@@ -142,16 +199,14 @@ do { \
142 * per-thread variables implemented as per-cpu variables and thus 199 * per-thread variables implemented as per-cpu variables and thus
143 * stable for the duration of the respective task. 200 * stable for the duration of the respective task.
144 */ 201 */
145#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ 202#define percpu_read(var) percpu_from_op("mov", var, "m" (var))
146 "m" (per_cpu__##var)) 203#define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var)))
147#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ 204#define percpu_write(var, val) percpu_to_op("mov", var, val)
148 "p" (&per_cpu__##var)) 205#define percpu_add(var, val) percpu_add_op(var, val)
149#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) 206#define percpu_sub(var, val) percpu_add_op(var, -(val))
150#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) 207#define percpu_and(var, val) percpu_to_op("and", var, val)
151#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) 208#define percpu_or(var, val) percpu_to_op("or", var, val)
152#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val) 209#define percpu_xor(var, val) percpu_to_op("xor", var, val)
153#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
154#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
155 210
156#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 211#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
157#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 212#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -160,9 +215,9 @@ do { \
160#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) 215#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
161#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) 216#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
162#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) 217#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
163#define __this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) 218#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
164#define __this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) 219#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
165#define __this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) 220#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
166#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) 221#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
167#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) 222#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
168#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) 223#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -179,9 +234,9 @@ do { \
179#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) 234#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
180#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) 235#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
181#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) 236#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
182#define this_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) 237#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
183#define this_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) 238#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
184#define this_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) 239#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
185#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) 240#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
186#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) 241#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
187#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) 242#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -192,9 +247,9 @@ do { \
192#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 247#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
193#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 248#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
194 249
195#define irqsafe_cpu_add_1(pcp, val) percpu_to_op("add", (pcp), val) 250#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
196#define irqsafe_cpu_add_2(pcp, val) percpu_to_op("add", (pcp), val) 251#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
197#define irqsafe_cpu_add_4(pcp, val) percpu_to_op("add", (pcp), val) 252#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
198#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) 253#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
199#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) 254#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
200#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) 255#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
@@ -212,19 +267,19 @@ do { \
212#ifdef CONFIG_X86_64 267#ifdef CONFIG_X86_64
213#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 268#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
214#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 269#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
215#define __this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) 270#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
216#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 271#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
217#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 272#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
218#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 273#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
219 274
220#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 275#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
221#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 276#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
222#define this_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) 277#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
223#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 278#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
224#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 279#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
225#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 280#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
226 281
227#define irqsafe_cpu_add_8(pcp, val) percpu_to_op("add", (pcp), val) 282#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
228#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 283#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
229#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 284#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
230#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 285#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
@@ -236,7 +291,7 @@ do { \
236({ \ 291({ \
237 int old__; \ 292 int old__; \
238 asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \ 293 asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
239 : "=r" (old__), "+m" (per_cpu__##var) \ 294 : "=r" (old__), "+m" (var) \
240 : "dIr" (bit)); \ 295 : "dIr" (bit)); \
241 old__; \ 296 old__; \
242}) 297})
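percpu_add_op() above folds a compile-time constant +1/-1 into inc/dec and otherwise emits a plain add, picking the operand size from sizeof(var). A rough user-space analogue of just that constant-folding idea for a 4-byte variable, with none of the per-cpu segment handling (x86 with GCC/Clang only):

#include <stdio.h>

/*
 * Pick inc/dec when the addend is a compile-time constant 1 or -1,
 * otherwise fall back to a plain add -- mirroring the pao_ID__ logic.
 */
#define local_add_op(var, val)                                          \
do {                                                                    \
        const int id__ = (__builtin_constant_p(val) &&                  \
                          ((val) == 1 || (val) == -1)) ? (val) : 0;     \
        if (id__ == 1)                                                  \
                asm("incl %0" : "+m" (var));                            \
        else if (id__ == -1)                                            \
                asm("decl %0" : "+m" (var));                            \
        else                                                            \
                asm("addl %1, %0" : "+m" (var) : "ri" (val));           \
} while (0)

int main(void)
{
        int counter = 0;

        local_add_op(counter, 1);       /* emits incl */
        local_add_op(counter, 5);       /* emits addl $5 */
        local_add_op(counter, -1);      /* emits decl */
        printf("counter = %d\n", counter);
        return 0;
}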
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 1380367dabd9..befd172c82ad 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -27,7 +27,14 @@
27/* 27/*
28 * Includes eventsel and unit mask as well: 28 * Includes eventsel and unit mask as well:
29 */ 29 */
30#define ARCH_PERFMON_EVENT_MASK 0xffff 30
31
32#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL
33#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL
34#define INTEL_ARCH_EDGE_MASK 0x00040000ULL
35#define INTEL_ARCH_INV_MASK 0x00800000ULL
36#define INTEL_ARCH_CNT_MASK 0xFF000000ULL
37#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK)
31 38
32/* 39/*
33 * filter mask to validate fixed counter events. 40 * filter mask to validate fixed counter events.
@@ -38,7 +45,12 @@
38 * The other filters are supported by fixed counters. 45 * The other filters are supported by fixed counters.
39 * The any-thread option is supported starting with v3. 46 * The any-thread option is supported starting with v3.
40 */ 47 */
41#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000 48#define INTEL_ARCH_FIXED_MASK \
49 (INTEL_ARCH_CNT_MASK| \
50 INTEL_ARCH_INV_MASK| \
51 INTEL_ARCH_EDGE_MASK|\
52 INTEL_ARCH_UNIT_MASK|\
53 INTEL_ARCH_EVTSEL_MASK)
42 54
43#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 55#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
44#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 56#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
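The INTEL_ARCH_* masks introduced above split a raw event-select value into its event code, unit mask, and control bits. A small sketch of decomposing one value with them; the 0x412e example encodes the architectural LLC-misses event (event 0x2e, umask 0x41):

#include <stdio.h>
#include <stdint.h>

#define INTEL_ARCH_EVTSEL_MASK  0x000000FFULL
#define INTEL_ARCH_UNIT_MASK    0x0000FF00ULL
#define INTEL_ARCH_EVENT_MASK   (INTEL_ARCH_UNIT_MASK | INTEL_ARCH_EVTSEL_MASK)

int main(void)
{
        uint64_t config = 0x412eULL;    /* example raw event-select value */

        printf("event select: %#llx\n",
               (unsigned long long)(config & INTEL_ARCH_EVTSEL_MASK));
        printf("unit mask:    %#llx\n",
               (unsigned long long)((config & INTEL_ARCH_UNIT_MASK) >> 8));
        printf("event+umask:  %#llx\n",
               (unsigned long long)(config & INTEL_ARCH_EVENT_MASK));
        return 0;
}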
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 0e8c2a0fd922..271de94c3810 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -23,6 +23,11 @@ static inline void paravirt_release_pud(unsigned long pfn) {}
23#endif 23#endif
24 24
25/* 25/*
26 * Flags to use when allocating a user page table page.
27 */
28extern gfp_t __userpte_alloc_gfp;
29
30/*
26 * Allocate and free page tables. 31 * Allocate and free page tables.
27 */ 32 */
28extern pgd_t *pgd_alloc(struct mm_struct *); 33extern pgd_t *pgd_alloc(struct mm_struct *);
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 01fd9461d323..47339a1ac7b6 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -54,10 +54,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
54 in_irq() ? KM_IRQ_PTE : \ 54 in_irq() ? KM_IRQ_PTE : \
55 KM_PTE0) 55 KM_PTE0)
56#define pte_offset_map(dir, address) \ 56#define pte_offset_map(dir, address) \
57 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ 57 ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \
58 pte_index((address))) 58 pte_index((address)))
59#define pte_offset_map_nested(dir, address) \ 59#define pte_offset_map_nested(dir, address) \
60 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ 60 ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \
61 pte_index((address))) 61 pte_index((address)))
62#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) 62#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
63#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) 63#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
@@ -80,7 +80,7 @@ do { \
80 * The i386 doesn't have any external MMU info: the kernel page 80 * The i386 doesn't have any external MMU info: the kernel page
81 * tables contain all the necessary information. 81 * tables contain all the necessary information.
82 */ 82 */
83#define update_mmu_cache(vma, address, pte) do { } while (0) 83#define update_mmu_cache(vma, address, ptep) do { } while (0)
84 84
85#endif /* !__ASSEMBLY__ */ 85#endif /* !__ASSEMBLY__ */
86 86
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index c57a30117149..181be528c612 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -129,7 +129,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
129#define pte_unmap(pte) /* NOP */ 129#define pte_unmap(pte) /* NOP */
130#define pte_unmap_nested(pte) /* NOP */ 130#define pte_unmap_nested(pte) /* NOP */
131 131
132#define update_mmu_cache(vma, address, pte) do { } while (0) 132#define update_mmu_cache(vma, address, ptep) do { } while (0)
133 133
134/* Encode and de-code a swap entry */ 134/* Encode and de-code a swap entry */
135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fc801bab1b3b..b753ea59703a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -450,6 +450,8 @@ struct thread_struct {
450 struct perf_event *ptrace_bps[HBP_NUM]; 450 struct perf_event *ptrace_bps[HBP_NUM];
451 /* Debug status used for traps, single steps, etc... */ 451 /* Debug status used for traps, single steps, etc... */
452 unsigned long debugreg6; 452 unsigned long debugreg6;
453 /* Keep track of the exact dr7 value set by the user */
454 unsigned long ptrace_dr7;
453 /* Fault info: */ 455 /* Fault info: */
454 unsigned long cr2; 456 unsigned long cr2;
455 unsigned long trap_no; 457 unsigned long trap_no;
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 4009f6534f52..6f414ed88620 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -23,14 +23,4 @@ extern int reboot_force;
23 23
24long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); 24long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
25 25
26/*
27 * This looks more complex than it should be. But we need to
28 * get the type for the ~ right in round_down (it needs to be
29 * as wide as the result!), and we want to evaluate the macro
30 * arguments just once each.
31 */
32#define __round_mask(x,y) ((__typeof__(x))((y)-1))
33#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1)
34#define round_down(x,y) ((x) & ~__round_mask(x,y))
35
36#endif /* _ASM_X86_PROTO_H */ 26#endif /* _ASM_X86_PROTO_H */
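The round_up()/round_down() macros removed above (together with the comment explaining why the mask is cast to the type of x) do power-of-two rounding with a mask of y-1. A quick user-space check of the arithmetic, assuming y is a power of two as the macros require:

#include <stdio.h>

/* Same definitions as the ones being removed from proto.h. */
#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
#define round_up(x, y)     ((((x) - 1) | __round_mask(x, y)) + 1)
#define round_down(x, y)   ((x) & ~__round_mask(x, y))

int main(void)
{
        unsigned long addr = 0x12345;

        /* Round to a 4 KB boundary: 0x12345 -> 0x13000 up, 0x12000 down. */
        printf("up:   %#lx\n", round_up(addr, 0x1000UL));
        printf("down: %#lx\n", round_down(addr, 0x1000UL));
        return 0;
}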
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 9d369f680321..20102808b191 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -274,10 +274,6 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
274 return 0; 274 return 0;
275} 275}
276 276
277/* Get Nth argument at function call */
278extern unsigned long regs_get_argument_nth(struct pt_regs *regs,
279 unsigned int n);
280
281/* 277/*
282 * These are defined as per linux/ptrace.h, which see. 278 * These are defined as per linux/ptrace.h, which see.
283 */ 279 */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index ca7517d33776..606ede126972 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -41,6 +41,7 @@
41#include <linux/list.h> 41#include <linux/list.h>
42#include <linux/spinlock.h> 42#include <linux/spinlock.h>
43#include <linux/lockdep.h> 43#include <linux/lockdep.h>
44#include <asm/asm.h>
44 45
45struct rwsem_waiter; 46struct rwsem_waiter;
46 47
@@ -55,17 +56,28 @@ extern asmregparm struct rw_semaphore *
55 56
56/* 57/*
57 * the semaphore definition 58 * the semaphore definition
59 *
60 * The bias values and the counter type limits the number of
61 * potential readers/writers to 32767 for 32 bits and 2147483647
62 * for 64 bits.
58 */ 63 */
59 64
60#define RWSEM_UNLOCKED_VALUE 0x00000000 65#ifdef CONFIG_X86_64
61#define RWSEM_ACTIVE_BIAS 0x00000001 66# define RWSEM_ACTIVE_MASK 0xffffffffL
62#define RWSEM_ACTIVE_MASK 0x0000ffff 67#else
63#define RWSEM_WAITING_BIAS (-0x00010000) 68# define RWSEM_ACTIVE_MASK 0x0000ffffL
69#endif
70
71#define RWSEM_UNLOCKED_VALUE 0x00000000L
72#define RWSEM_ACTIVE_BIAS 0x00000001L
73#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
64#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 74#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
65#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 75#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
66 76
77typedef signed long rwsem_count_t;
78
67struct rw_semaphore { 79struct rw_semaphore {
68 signed long count; 80 rwsem_count_t count;
69 spinlock_t wait_lock; 81 spinlock_t wait_lock;
70 struct list_head wait_list; 82 struct list_head wait_list;
71#ifdef CONFIG_DEBUG_LOCK_ALLOC 83#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -105,7 +117,7 @@ do { \
105static inline void __down_read(struct rw_semaphore *sem) 117static inline void __down_read(struct rw_semaphore *sem)
106{ 118{
107 asm volatile("# beginning down_read\n\t" 119 asm volatile("# beginning down_read\n\t"
108 LOCK_PREFIX " incl (%%eax)\n\t" 120 LOCK_PREFIX _ASM_INC "(%1)\n\t"
109 /* adds 0x00000001, returns the old value */ 121 /* adds 0x00000001, returns the old value */
110 " jns 1f\n" 122 " jns 1f\n"
111 " call call_rwsem_down_read_failed\n" 123 " call call_rwsem_down_read_failed\n"
@@ -121,14 +133,14 @@ static inline void __down_read(struct rw_semaphore *sem)
121 */ 133 */
122static inline int __down_read_trylock(struct rw_semaphore *sem) 134static inline int __down_read_trylock(struct rw_semaphore *sem)
123{ 135{
124 __s32 result, tmp; 136 rwsem_count_t result, tmp;
125 asm volatile("# beginning __down_read_trylock\n\t" 137 asm volatile("# beginning __down_read_trylock\n\t"
126 " movl %0,%1\n\t" 138 " mov %0,%1\n\t"
127 "1:\n\t" 139 "1:\n\t"
128 " movl %1,%2\n\t" 140 " mov %1,%2\n\t"
129 " addl %3,%2\n\t" 141 " add %3,%2\n\t"
130 " jle 2f\n\t" 142 " jle 2f\n\t"
131 LOCK_PREFIX " cmpxchgl %2,%0\n\t" 143 LOCK_PREFIX " cmpxchg %2,%0\n\t"
132 " jnz 1b\n\t" 144 " jnz 1b\n\t"
133 "2:\n\t" 145 "2:\n\t"
134 "# ending __down_read_trylock\n\t" 146 "# ending __down_read_trylock\n\t"
@@ -143,13 +155,13 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
143 */ 155 */
144static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
145{ 157{
146 int tmp; 158 rwsem_count_t tmp;
147 159
148 tmp = RWSEM_ACTIVE_WRITE_BIAS; 160 tmp = RWSEM_ACTIVE_WRITE_BIAS;
149 asm volatile("# beginning down_write\n\t" 161 asm volatile("# beginning down_write\n\t"
150 LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" 162 LOCK_PREFIX " xadd %1,(%2)\n\t"
151 /* subtract 0x0000ffff, returns the old value */ 163 /* subtract 0x0000ffff, returns the old value */
152 " testl %%edx,%%edx\n\t" 164 " test %1,%1\n\t"
153 /* was the count 0 before? */ 165 /* was the count 0 before? */
154 " jz 1f\n" 166 " jz 1f\n"
155 " call call_rwsem_down_write_failed\n" 167 " call call_rwsem_down_write_failed\n"
@@ -170,9 +182,9 @@ static inline void __down_write(struct rw_semaphore *sem)
170 */ 182 */
171static inline int __down_write_trylock(struct rw_semaphore *sem) 183static inline int __down_write_trylock(struct rw_semaphore *sem)
172{ 184{
173 signed long ret = cmpxchg(&sem->count, 185 rwsem_count_t ret = cmpxchg(&sem->count,
174 RWSEM_UNLOCKED_VALUE, 186 RWSEM_UNLOCKED_VALUE,
175 RWSEM_ACTIVE_WRITE_BIAS); 187 RWSEM_ACTIVE_WRITE_BIAS);
176 if (ret == RWSEM_UNLOCKED_VALUE) 188 if (ret == RWSEM_UNLOCKED_VALUE)
177 return 1; 189 return 1;
178 return 0; 190 return 0;
@@ -183,9 +195,9 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
183 */ 195 */
184static inline void __up_read(struct rw_semaphore *sem) 196static inline void __up_read(struct rw_semaphore *sem)
185{ 197{
186 __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; 198 rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS;
187 asm volatile("# beginning __up_read\n\t" 199 asm volatile("# beginning __up_read\n\t"
188 LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" 200 LOCK_PREFIX " xadd %1,(%2)\n\t"
189 /* subtracts 1, returns the old value */ 201 /* subtracts 1, returns the old value */
190 " jns 1f\n\t" 202 " jns 1f\n\t"
191 " call call_rwsem_wake\n" 203 " call call_rwsem_wake\n"
@@ -201,18 +213,18 @@ static inline void __up_read(struct rw_semaphore *sem)
201 */ 213 */
202static inline void __up_write(struct rw_semaphore *sem) 214static inline void __up_write(struct rw_semaphore *sem)
203{ 215{
216 rwsem_count_t tmp;
204 asm volatile("# beginning __up_write\n\t" 217 asm volatile("# beginning __up_write\n\t"
205 " movl %2,%%edx\n\t" 218 LOCK_PREFIX " xadd %1,(%2)\n\t"
206 LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t"
207 /* tries to transition 219 /* tries to transition
208 0xffff0001 -> 0x00000000 */ 220 0xffff0001 -> 0x00000000 */
209 " jz 1f\n" 221 " jz 1f\n"
210 " call call_rwsem_wake\n" 222 " call call_rwsem_wake\n"
211 "1:\n\t" 223 "1:\n\t"
212 "# ending __up_write\n" 224 "# ending __up_write\n"
213 : "+m" (sem->count) 225 : "+m" (sem->count), "=d" (tmp)
214 : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS) 226 : "a" (sem), "1" (-RWSEM_ACTIVE_WRITE_BIAS)
215 : "memory", "cc", "edx"); 227 : "memory", "cc");
216} 228}
217 229
218/* 230/*
@@ -221,33 +233,38 @@ static inline void __up_write(struct rw_semaphore *sem)
221static inline void __downgrade_write(struct rw_semaphore *sem) 233static inline void __downgrade_write(struct rw_semaphore *sem)
222{ 234{
223 asm volatile("# beginning __downgrade_write\n\t" 235 asm volatile("# beginning __downgrade_write\n\t"
224 LOCK_PREFIX " addl %2,(%%eax)\n\t" 236 LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t"
225 /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ 237 /*
238 * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386)
239 * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64)
240 */
226 " jns 1f\n\t" 241 " jns 1f\n\t"
227 " call call_rwsem_downgrade_wake\n" 242 " call call_rwsem_downgrade_wake\n"
228 "1:\n\t" 243 "1:\n\t"
229 "# ending __downgrade_write\n" 244 "# ending __downgrade_write\n"
230 : "+m" (sem->count) 245 : "+m" (sem->count)
231 : "a" (sem), "i" (-RWSEM_WAITING_BIAS) 246 : "a" (sem), "er" (-RWSEM_WAITING_BIAS)
232 : "memory", "cc"); 247 : "memory", "cc");
233} 248}
234 249
235/* 250/*
236 * implement atomic add functionality 251 * implement atomic add functionality
237 */ 252 */
238static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) 253static inline void rwsem_atomic_add(rwsem_count_t delta,
254 struct rw_semaphore *sem)
239{ 255{
240 asm volatile(LOCK_PREFIX "addl %1,%0" 256 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
241 : "+m" (sem->count) 257 : "+m" (sem->count)
242 : "ir" (delta)); 258 : "er" (delta));
243} 259}
244 260
245/* 261/*
246 * implement exchange and add functionality 262 * implement exchange and add functionality
247 */ 263 */
248static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) 264static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
265 struct rw_semaphore *sem)
249{ 266{
250 int tmp = delta; 267 rwsem_count_t tmp = delta;
251 268
252 asm volatile(LOCK_PREFIX "xadd %0,%1" 269 asm volatile(LOCK_PREFIX "xadd %0,%1"
253 : "+r" (tmp), "+m" (sem->count) 270 : "+r" (tmp), "+m" (sem->count)
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 1e796782cd7b..4cfc90824068 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -135,6 +135,8 @@ int native_cpu_disable(void);
135void native_cpu_die(unsigned int cpu); 135void native_cpu_die(unsigned int cpu);
136void native_play_dead(void); 136void native_play_dead(void);
137void play_dead_common(void); 137void play_dead_common(void);
138void wbinvd_on_cpu(int cpu);
139int wbinvd_on_all_cpus(void);
138 140
139void native_send_call_func_ipi(const struct cpumask *mask); 141void native_send_call_func_ipi(const struct cpumask *mask);
140void native_send_call_func_single_ipi(int cpu); 142void native_send_call_func_single_ipi(int cpu);
@@ -147,6 +149,13 @@ static inline int num_booting_cpus(void)
147{ 149{
148 return cpumask_weight(cpu_callout_mask); 150 return cpumask_weight(cpu_callout_mask);
149} 151}
152#else /* !CONFIG_SMP */
153#define wbinvd_on_cpu(cpu) wbinvd()
154static inline int wbinvd_on_all_cpus(void)
155{
156 wbinvd();
157 return 0;
158}
150#endif /* CONFIG_SMP */ 159#endif /* CONFIG_SMP */
151 160
152extern unsigned disabled_cpus __cpuinitdata; 161extern unsigned disabled_cpus __cpuinitdata;
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 35e89122a42f..4dab78edbad9 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,8 +3,6 @@
3 3
4extern int kstack_depth_to_print; 4extern int kstack_depth_to_print;
5 5
6int x86_is_stack_id(int id, char *name);
7
8struct thread_info; 6struct thread_info;
9struct stacktrace_ops; 7struct stacktrace_ops;
10 8
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1fecb7e61130..38638cd2fa4c 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -313,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb {
313 313
314#define SVM_EXIT_ERR -1 314#define SVM_EXIT_ERR -1
315 315
316#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ 316#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
317 317
318#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" 318#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
319#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" 319#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 8d33bc5462d1..c4a348f7bd43 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -16,6 +16,8 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/err.h> 17#include <linux/err.h>
18 18
19extern const unsigned long sys_call_table[];
20
19/* 21/*
20 * Only the low 32 bits of orig_ax are meaningful, so we return int. 22 * Only the low 32 bits of orig_ax are meaningful, so we return int.
21 * This importantly ignores the high bits on 64-bit, so comparisons 23 * This importantly ignores the high bits on 64-bit, so comparisons
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index e04740f7a0bb..b8fe48ee2ed9 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -32,7 +32,7 @@ extern void show_regs_common(void);
32 "movl %P[task_canary](%[next]), %%ebx\n\t" \ 32 "movl %P[task_canary](%[next]), %%ebx\n\t" \
33 "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" 33 "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
34#define __switch_canary_oparam \ 34#define __switch_canary_oparam \
35 , [stack_canary] "=m" (per_cpu_var(stack_canary.canary)) 35 , [stack_canary] "=m" (stack_canary.canary)
36#define __switch_canary_iparam \ 36#define __switch_canary_iparam \
37 , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) 37 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
38#else /* CC_STACKPROTECTOR */ 38#else /* CC_STACKPROTECTOR */
@@ -114,7 +114,7 @@ do { \
114 "movq %P[task_canary](%%rsi),%%r8\n\t" \ 114 "movq %P[task_canary](%%rsi),%%r8\n\t" \
115 "movq %%r8,"__percpu_arg([gs_canary])"\n\t" 115 "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
116#define __switch_canary_oparam \ 116#define __switch_canary_oparam \
117 , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary)) 117 , [gs_canary] "=m" (irq_stack_union.stack_canary)
118#define __switch_canary_iparam \ 118#define __switch_canary_iparam \
119 , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) 119 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
120#else /* CC_STACKPROTECTOR */ 120#else /* CC_STACKPROTECTOR */
@@ -133,7 +133,7 @@ do { \
133 __switch_canary \ 133 __switch_canary \
134 "movq %P[thread_info](%%rsi),%%r8\n\t" \ 134 "movq %P[thread_info](%%rsi),%%r8\n\t" \
135 "movq %%rax,%%rdi\n\t" \ 135 "movq %%rax,%%rdi\n\t" \
136 "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ 136 "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
137 "jnz ret_from_fork\n\t" \ 137 "jnz ret_from_fork\n\t" \
138 RESTORE_CONTEXT \ 138 RESTORE_CONTEXT \
139 : "=a" (last) \ 139 : "=a" (last) \
@@ -143,7 +143,7 @@ do { \
143 [ti_flags] "i" (offsetof(struct thread_info, flags)), \ 143 [ti_flags] "i" (offsetof(struct thread_info, flags)), \
144 [_tif_fork] "i" (_TIF_FORK), \ 144 [_tif_fork] "i" (_TIF_FORK), \
145 [thread_info] "i" (offsetof(struct task_struct, stack)), \ 145 [thread_info] "i" (offsetof(struct task_struct, stack)), \
146 [current_task] "m" (per_cpu_var(current_task)) \ 146 [current_task] "m" (current_task) \
147 __switch_canary_iparam \ 147 __switch_canary_iparam \
148 : "memory", "cc" __EXTRA_CLOBBER) 148 : "memory", "cc" __EXTRA_CLOBBER)
149#endif 149#endif
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 535e421498f6..316708d5af92 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -8,6 +8,8 @@
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/prefetch.h> 9#include <linux/prefetch.h>
10#include <linux/lockdep.h> 10#include <linux/lockdep.h>
11#include <asm/alternative.h>
12#include <asm/cpufeature.h>
11#include <asm/page.h> 13#include <asm/page.h>
12 14
13/* 15/*
@@ -16,7 +18,24 @@
16 18
17/* Handles exceptions in both to and from, but doesn't do access_ok */ 19/* Handles exceptions in both to and from, but doesn't do access_ok */
18__must_check unsigned long 20__must_check unsigned long
19copy_user_generic(void *to, const void *from, unsigned len); 21copy_user_generic_string(void *to, const void *from, unsigned len);
22__must_check unsigned long
23copy_user_generic_unrolled(void *to, const void *from, unsigned len);
24
25static __always_inline __must_check unsigned long
26copy_user_generic(void *to, const void *from, unsigned len)
27{
28 unsigned ret;
29
30 alternative_call(copy_user_generic_unrolled,
31 copy_user_generic_string,
32 X86_FEATURE_REP_GOOD,
33 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
34 "=d" (len)),
35 "1" (to), "2" (from), "3" (len)
36 : "memory", "rcx", "r8", "r9", "r10", "r11");
37 return ret;
38}
20 39
21__must_check unsigned long 40__must_check unsigned long
22_copy_to_user(void __user *to, const void *from, unsigned len); 41_copy_to_user(void __user *to, const void *from, unsigned len);
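The new copy_user_generic() above uses alternative_call() so the call target is patched once, at boot, to either the unrolled or the rep-string variant depending on X86_FEATURE_REP_GOOD. A plain user-space analogue of that "select an implementation once, then call through it" pattern, with a function pointer standing in for the patched call site and a dummy feature flag:

#include <stdio.h>
#include <string.h>

static unsigned long copy_unrolled(void *to, const void *from, unsigned len)
{
        memcpy(to, from, len);          /* stand-in for the unrolled copy */
        return 0;                       /* 0 bytes left uncopied */
}

static unsigned long copy_string(void *to, const void *from, unsigned len)
{
        memcpy(to, from, len);          /* stand-in for the rep-movs copy */
        return 0;
}

/* Chosen once, like the alternatives patching done at boot. */
static unsigned long (*copy_user)(void *, const void *, unsigned) = copy_unrolled;

static void pick_implementation(int cpu_has_rep_good)
{
        copy_user = cpu_has_rep_good ? copy_string : copy_unrolled;
}

int main(void)
{
        char src[] = "hello", dst[8];

        pick_implementation(1);         /* pretend REP_GOOD is set */
        if (copy_user(dst, src, sizeof(src)) == 0)
                printf("copied: %s\n", dst);
        return 0;
}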
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
index 999873b22e7f..24532c7da3d6 100644
--- a/arch/x86/include/asm/user.h
+++ b/arch/x86/include/asm/user.h
@@ -1,5 +1,63 @@
1#ifndef _ASM_X86_USER_H
2#define _ASM_X86_USER_H
3
1#ifdef CONFIG_X86_32 4#ifdef CONFIG_X86_32
2# include "user_32.h" 5# include "user_32.h"
3#else 6#else
4# include "user_64.h" 7# include "user_64.h"
5#endif 8#endif
9
10#include <asm/types.h>
11
12struct user_ymmh_regs {
13 /* 16 * 16 bytes for each YMMH-reg */
14 __u32 ymmh_space[64];
15};
16
17struct user_xsave_hdr {
18 __u64 xstate_bv;
19 __u64 reserved1[2];
20 __u64 reserved2[5];
21};
22
23/*
24 * The structure layout of user_xstateregs, used for exporting the
25 * extended register state through ptrace and core-dump (NT_X86_XSTATE note)
26 * interfaces will be same as the memory layout of xsave used by the processor
27 * (except for the bytes 464..511, which can be used by the software) and hence
28 * the size of this structure varies depending on the features supported by the
29 * processor and OS. The size of the structure that users need to use can be
30 * obtained by doing:
31 * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx);
32 * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that user (debuggers, etc.)
33 * need to use.
34 *
35 * For now, only the first 8 bytes of the software usable bytes[464..471] will
36 * be used and will be set to OS enabled xstate mask (which is same as the
37 * 64bit mask returned by the xgetbv's xCR0). Users (analyzing core dump
38 * remotely, etc.) can use this mask as well as the mask saved in the
39 * xstate_hdr bytes and interpret what states the processor/OS supports
40 * and what states are in modified/initialized conditions for the
41 * particular process/thread.
42 *
43 * Also when the user modifies certain state FP/SSE/etc through the
44 * ptrace interface, they must ensure that the xsave_hdr.xstate_bv
45 * bytes[512..519] of the memory layout are updated correspondingly.
46 * i.e., for example when FP state is modified to a non-init state,
47 * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to
48 * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc.
49 */
50#define USER_XSTATE_FX_SW_WORDS 6
51#define USER_XSTATE_XCR0_WORD 0
52
53struct user_xstateregs {
54 struct {
55 __u64 fpx_space[58];
56 __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS];
57 } i387;
58 struct user_xsave_hdr xsave_hdr;
59 struct user_ymmh_regs ymmh;
60 /* further processor state extensions go here */
61};
62
63#endif /* _ASM_X86_USER_H */
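The comment above tells debuggers to size the exported xstate area from cpuid.(eax=0xd,ecx=0).ebx. A user-space sketch of that query using the compiler-provided <cpuid.h> helper; the helper is a GCC/Clang x86 facility, not part of this patch:

#include <stdio.h>
#include <cpuid.h>      /* __get_cpuid_count(), GCC/Clang on x86 */

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* Leaf 0xd, sub-leaf 0: EBX = size of the OS-enabled xsave area. */
        if (!__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx)) {
                fprintf(stderr, "CPUID leaf 0xd not supported\n");
                return 1;
        }
        printf("xstate area for ptrace/core-dump: %u bytes\n", ebx);
        return 0;
}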
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 2751f3075d8b..71605c7d5c5c 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -18,8 +18,8 @@
18 * along with this program; if not, write to the Free Software 18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * 20 *
21 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. 21 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
22 * Copyright (c) Russ Anderson 22 * Copyright (c) Russ Anderson <rja@sgi.com>
23 */ 23 */
24 24
25#include <linux/rtc.h> 25#include <linux/rtc.h>
@@ -36,7 +36,8 @@ enum uv_bios_cmd {
36 UV_BIOS_WATCHLIST_ALLOC, 36 UV_BIOS_WATCHLIST_ALLOC,
37 UV_BIOS_WATCHLIST_FREE, 37 UV_BIOS_WATCHLIST_FREE,
38 UV_BIOS_MEMPROTECT, 38 UV_BIOS_MEMPROTECT,
39 UV_BIOS_GET_PARTITION_ADDR 39 UV_BIOS_GET_PARTITION_ADDR,
40 UV_BIOS_SET_LEGACY_VGA_TARGET
40}; 41};
41 42
42/* 43/*
@@ -89,13 +90,14 @@ extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
89extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); 90extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
90extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); 91extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
91 92
92extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); 93extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *);
93extern s64 uv_bios_freq_base(u64, u64 *); 94extern s64 uv_bios_freq_base(u64, u64 *);
94extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, 95extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int,
95 unsigned long *); 96 unsigned long *);
96extern int uv_bios_mq_watchlist_free(int, int); 97extern int uv_bios_mq_watchlist_free(int, int);
97extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); 98extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
98extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); 99extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
100extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus);
99 101
100extern void uv_bios_init(void); 102extern void uv_bios_init(void);
101 103
@@ -104,6 +106,7 @@ extern int uv_type;
104extern long sn_partition_id; 106extern long sn_partition_id;
105extern long sn_coherency_id; 107extern long sn_coherency_id;
106extern long sn_region_size; 108extern long sn_region_size;
109extern long system_serial_number;
107#define partition_coherence_id() (sn_coherency_id) 110#define partition_coherence_id() (sn_coherency_id)
108 111
109extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ 112extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index c0a01b5d985b..3bb9491b7659 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -11,6 +11,7 @@ struct mm_struct;
11extern enum uv_system_type get_uv_system_type(void); 11extern enum uv_system_type get_uv_system_type(void);
12extern int is_uv_system(void); 12extern int is_uv_system(void);
13extern void uv_cpu_init(void); 13extern void uv_cpu_init(void);
14extern void uv_nmi_init(void);
14extern void uv_system_init(void); 15extern void uv_system_init(void);
15extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
16 struct mm_struct *mm, 17 struct mm_struct *mm,
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 40be813fefb1..14cc74ba5d23 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -329,7 +329,8 @@ static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset
329 */ 329 */
330static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset) 330static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset)
331{ 331{
332 return UV_GLOBAL_GRU_MMR_BASE | offset | (pnode << uv_hub_info->m_val); 332 return UV_GLOBAL_GRU_MMR_BASE | offset |
333 ((unsigned long)pnode << uv_hub_info->m_val);
333} 334}
334 335
335static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val) 336static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val)
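The cast added above matters because pnode is an int: without widening, the shift by m_val is done in 32-bit arithmetic and high bits of the GRU MMR address are lost. A small demonstration of the difference with made-up values, modelling the narrow shift as an unsigned 32-bit one (assumes a 64-bit long):

#include <stdio.h>

int main(void)
{
        int pnode = 5;                  /* made-up node number */
        unsigned int m_val = 30;        /* made-up per-hub shift */

        unsigned int  only32 = (unsigned int)pnode << m_val;   /* old, narrow shift */
        unsigned long full64 = (unsigned long)pnode << m_val;  /* fixed, 64-bit shift */

        printf("32-bit shift: %#x  (bit 32 lost)\n", only32);
        printf("64-bit shift: %#lx\n", full64);
        return 0;
}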
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2b4945419a84..fb9a080740ec 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -53,6 +53,7 @@
53 */ 53 */
54#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 54#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
56#define SECONDARY_EXEC_RDTSCP 0x00000008
56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 57#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 58#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 59#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
@@ -251,6 +252,7 @@ enum vmcs_field {
251#define EXIT_REASON_MSR_READ 31 252#define EXIT_REASON_MSR_READ 31
252#define EXIT_REASON_MSR_WRITE 32 253#define EXIT_REASON_MSR_WRITE 32
253#define EXIT_REASON_MWAIT_INSTRUCTION 36 254#define EXIT_REASON_MWAIT_INSTRUCTION 36
255#define EXIT_REASON_MONITOR_INSTRUCTION 39
254#define EXIT_REASON_PAUSE_INSTRUCTION 40 256#define EXIT_REASON_PAUSE_INSTRUCTION 40
255#define EXIT_REASON_MCE_DURING_VMENTRY 41 257#define EXIT_REASON_MCE_DURING_VMENTRY 41
256#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 258#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
@@ -362,6 +364,7 @@ enum vmcs_field {
362#define VMX_EPTP_UC_BIT (1ull << 8) 364#define VMX_EPTP_UC_BIT (1ull << 8)
363#define VMX_EPTP_WB_BIT (1ull << 14) 365#define VMX_EPTP_WB_BIT (1ull << 14)
364#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 366#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
367#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
365#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 368#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
366#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 369#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
367#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 370#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -374,7 +377,7 @@ enum vmcs_field {
374#define VMX_EPT_READABLE_MASK 0x1ull 377#define VMX_EPT_READABLE_MASK 0x1ull
375#define VMX_EPT_WRITABLE_MASK 0x2ull 378#define VMX_EPT_WRITABLE_MASK 0x2ull
376#define VMX_EPT_EXECUTABLE_MASK 0x4ull 379#define VMX_EPT_EXECUTABLE_MASK 0x4ull
377#define VMX_EPT_IGMT_BIT (1ull << 6) 380#define VMX_EPT_IPAT_BIT (1ull << 6)
378 381
379#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 382#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
380 383
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index b1d62bbcae3d..519b54327d75 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -141,6 +141,7 @@ struct x86_cpuinit_ops {
141 * @get_wallclock: get time from HW clock like RTC etc. 141 * @get_wallclock: get time from HW clock like RTC etc.
142 * @set_wallclock: set time back to HW clock 142 * @set_wallclock: set time back to HW clock
143 * @is_untracked_pat_range exclude from PAT logic 143 * @is_untracked_pat_range exclude from PAT logic
144 * @nmi_init enable NMI on cpus
144 */ 145 */
145struct x86_platform_ops { 146struct x86_platform_ops {
146 unsigned long (*calibrate_tsc)(void); 147 unsigned long (*calibrate_tsc)(void);
@@ -148,6 +149,7 @@ struct x86_platform_ops {
148 int (*set_wallclock)(unsigned long nowtime); 149 int (*set_wallclock)(unsigned long nowtime);
149 void (*iommu_shutdown)(void); 150 void (*iommu_shutdown)(void);
150 bool (*is_untracked_pat_range)(u64 start, u64 end); 151 bool (*is_untracked_pat_range)(u64 start, u64 end);
152 void (*nmi_init)(void);
151}; 153};
152 154
153extern struct x86_init_ops x86_init; 155extern struct x86_init_ops x86_init;
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 727acc152344..ddc04ccad03b 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -27,9 +27,11 @@
27extern unsigned int xstate_size; 27extern unsigned int xstate_size;
28extern u64 pcntxt_mask; 28extern u64 pcntxt_mask;
29extern struct xsave_struct *init_xstate_buf; 29extern struct xsave_struct *init_xstate_buf;
30extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
30 31
31extern void xsave_cntxt_init(void); 32extern void xsave_cntxt_init(void);
32extern void xsave_init(void); 33extern void xsave_init(void);
34extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
33extern int init_fpu(struct task_struct *child); 35extern int init_fpu(struct task_struct *child);
34extern int check_for_xstate(struct i387_fxsave_struct __user *buf, 36extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
35 void __user *fpstate, 37 void __user *fpstate,
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index db2773c6defd..a54d714545ff 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -50,6 +50,7 @@ EXPORT_SYMBOL(acpi_disabled);
50 50
51#ifdef CONFIG_X86_64 51#ifdef CONFIG_X86_64
52# include <asm/proto.h> 52# include <asm/proto.h>
53# include <asm/numa_64.h>
53#endif /* X86 */ 54#endif /* X86 */
54 55
55#define BAD_MADT_ENTRY(entry, end) ( \ 56#define BAD_MADT_ENTRY(entry, end) ( \
@@ -490,6 +491,25 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
490 */ 491 */
491#ifdef CONFIG_ACPI_HOTPLUG_CPU 492#ifdef CONFIG_ACPI_HOTPLUG_CPU
492 493
494static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
495{
496#ifdef CONFIG_ACPI_NUMA
497 int nid;
498
499 nid = acpi_get_node(handle);
500 if (nid == -1 || !node_online(nid))
501 return;
502#ifdef CONFIG_X86_64
503 apicid_to_node[physid] = nid;
504 numa_set_node(cpu, nid);
505#else /* CONFIG_X86_32 */
506 apicid_2_node[physid] = nid;
507 cpu_to_node_map[cpu] = nid;
508#endif
509
510#endif
511}
512
493static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) 513static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
494{ 514{
495 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 515 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -548,6 +568,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
548 } 568 }
549 569
550 cpu = cpumask_first(new_map); 570 cpu = cpumask_first(new_map);
571 acpi_map_cpu2node(handle, cpu, physid);
551 572
552 *pcpu = cpu; 573 *pcpu = cpu;
553 retval = 0; 574 retval = 0;
@@ -1352,14 +1373,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1352 }, 1373 },
1353 { 1374 {
1354 .callback = force_acpi_ht, 1375 .callback = force_acpi_ht,
1355 .ident = "ASUS P2B-DS",
1356 .matches = {
1357 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1358 DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1359 },
1360 },
1361 {
1362 .callback = force_acpi_ht,
1363 .ident = "ASUS CUR-DLS", 1376 .ident = "ASUS CUR-DLS",
1364 .matches = { 1377 .matches = {
1365 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), 1378 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index de7353c0ce9c..3a4bf35c179b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,7 @@
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/vmalloc.h> 8#include <linux/vmalloc.h>
9#include <linux/memory.h> 9#include <linux/memory.h>
10#include <linux/stop_machine.h>
10#include <asm/alternative.h> 11#include <asm/alternative.h>
11#include <asm/sections.h> 12#include <asm/sections.h>
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
@@ -205,7 +206,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
205 struct alt_instr *end) 206 struct alt_instr *end)
206{ 207{
207 struct alt_instr *a; 208 struct alt_instr *a;
208 char insnbuf[MAX_PATCH_LEN]; 209 u8 insnbuf[MAX_PATCH_LEN];
209 210
210 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 211 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
211 for (a = start; a < end; a++) { 212 for (a = start; a < end; a++) {
@@ -223,6 +224,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
223 } 224 }
224#endif 225#endif
225 memcpy(insnbuf, a->replacement, a->replacementlen); 226 memcpy(insnbuf, a->replacement, a->replacementlen);
227 if (*insnbuf == 0xe8 && a->replacementlen == 5)
228 *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
226 add_nops(insnbuf + a->replacementlen, 229 add_nops(insnbuf + a->replacementlen,
227 a->instrlen - a->replacementlen); 230 a->instrlen - a->replacementlen);
228 text_poke_early(instr, insnbuf, a->instrlen); 231 text_poke_early(instr, insnbuf, a->instrlen);
@@ -390,6 +393,24 @@ void alternatives_smp_switch(int smp)
390 mutex_unlock(&smp_alt); 393 mutex_unlock(&smp_alt);
391} 394}
392 395
396/* Return 1 if the address range is reserved for smp-alternatives */
397int alternatives_text_reserved(void *start, void *end)
398{
399 struct smp_alt_module *mod;
400 u8 **ptr;
401 u8 *text_start = start;
402 u8 *text_end = end;
403
404 list_for_each_entry(mod, &smp_alt_modules, next) {
405 if (mod->text > text_end || mod->text_end < text_start)
406 continue;
407 for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
408 if (text_start <= *ptr && text_end >= *ptr)
409 return 1;
410 }
411
412 return 0;
413}
393#endif 414#endif
394 415
395#ifdef CONFIG_PARAVIRT 416#ifdef CONFIG_PARAVIRT
@@ -552,3 +573,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
552 local_irq_restore(flags); 573 local_irq_restore(flags);
553 return addr; 574 return addr;
554} 575}
576
577/*
578 * Cross-modifying kernel text with stop_machine().
579 * This code originally comes from immediate value.
580 */
581static atomic_t stop_machine_first;
582static int wrote_text;
583
584struct text_poke_params {
585 void *addr;
586 const void *opcode;
587 size_t len;
588};
589
590static int __kprobes stop_machine_text_poke(void *data)
591{
592 struct text_poke_params *tpp = data;
593
594 if (atomic_dec_and_test(&stop_machine_first)) {
595 text_poke(tpp->addr, tpp->opcode, tpp->len);
596 smp_wmb(); /* Make sure other cpus see that this has run */
597 wrote_text = 1;
598 } else {
599 while (!wrote_text)
600 cpu_relax();
601 smp_mb(); /* Load wrote_text before following execution */
602 }
603
604 flush_icache_range((unsigned long)tpp->addr,
605 (unsigned long)tpp->addr + tpp->len);
606 return 0;
607}
608
609/**
610 * text_poke_smp - Update instructions on a live kernel on SMP
611 * @addr: address to modify
612 * @opcode: source of the copy
613 * @len: length to copy
614 *
615 * Modify multi-byte instruction by using stop_machine() on SMP. This allows
616 * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
617 * should be allowed, since stop_machine() does _not_ protect code against
618 * NMI and MCE.
619 *
620 * Note: Must be called under get_online_cpus() and text_mutex.
621 */
622void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
623{
624 struct text_poke_params tpp;
625
626 tpp.addr = addr;
627 tpp.opcode = opcode;
628 tpp.len = len;
629 atomic_set(&stop_machine_first, 1);
630 wrote_text = 0;
631 stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
632 return addr;
633}
634
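text_poke_smp() above rendezvouses all CPUs in stop_machine(): the first CPU through writes the new bytes while the others spin until wrote_text is set. A sketch of how a caller might use it, following the locking rule in its kerneldoc; the header locations and the 5-byte jmp payload are illustrative assumptions, not taken from this patch:

#include <linux/cpu.h>          /* get_online_cpus()/put_online_cpus() */
#include <linux/memory.h>       /* text_mutex */
#include <linux/mutex.h>
#include <asm/alternative.h>    /* text_poke_smp(), assumed declared here */

/* Illustrative helper: overwrite 5 bytes of kernel text with a jmp. */
static void patch_with_jmp(void *addr, const unsigned char jmp_insn[5])
{
        get_online_cpus();              /* required by text_poke_smp() */
        mutex_lock(&text_mutex);
        text_poke_smp(addr, jmp_insn, 5);
        mutex_unlock(&text_mutex);
        put_online_cpus();
}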
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 2afa27d01297..4b7099526d2c 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -69,7 +69,7 @@ static void apbt_set_mode(enum clock_event_mode mode,
69static int apbt_next_event(unsigned long delta, 69static int apbt_next_event(unsigned long delta,
70 struct clock_event_device *evt); 70 struct clock_event_device *evt);
71static cycle_t apbt_read_clocksource(struct clocksource *cs); 71static cycle_t apbt_read_clocksource(struct clocksource *cs);
72static void apbt_restart_clocksource(void); 72static void apbt_restart_clocksource(struct clocksource *cs);
73 73
74struct apbt_dev { 74struct apbt_dev {
75 struct clock_event_device evt; 75 struct clock_event_device evt;
@@ -249,7 +249,7 @@ static irqreturn_t apbt_interrupt_handler(int irq, void *data)
249 return IRQ_HANDLED; 249 return IRQ_HANDLED;
250} 250}
251 251
252static void apbt_restart_clocksource(void) 252static void apbt_restart_clocksource(struct clocksource *cs)
253{ 253{
254 apbt_start_counter(phy_cs_timer_id); 254 apbt_start_counter(phy_cs_timer_id);
255} 255}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 94f22b12858d..00187f1fcfb7 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -581,7 +581,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
581 res = (((u64)(*deltatsc)) * pm_100ms); 581 res = (((u64)(*deltatsc)) * pm_100ms);
582 do_div(res, deltapm); 582 do_div(res, deltapm);
583 apic_printk(APIC_VERBOSE, "TSC delta adjusted to " 583 apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
584 "PM-Timer: %lu (%ld) \n", 584 "PM-Timer: %lu (%ld)\n",
585 (unsigned long)res, *deltatsc); 585 (unsigned long)res, *deltatsc);
586 *deltatsc = (long)res; 586 *deltatsc = (long)res;
587 } 587 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index b758d49b811c..e4e0ddcb1546 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1474,7 +1474,7 @@ static struct {
1474 1474
1475static void __init setup_IO_APIC_irqs(void) 1475static void __init setup_IO_APIC_irqs(void)
1476{ 1476{
1477 int apic_id = 0, pin, idx, irq; 1477 int apic_id, pin, idx, irq;
1478 int notcon = 0; 1478 int notcon = 0;
1479 struct irq_desc *desc; 1479 struct irq_desc *desc;
1480 struct irq_cfg *cfg; 1480 struct irq_cfg *cfg;
@@ -1482,14 +1482,7 @@ static void __init setup_IO_APIC_irqs(void)
1482 1482
1483 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1483 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1484 1484
1485#ifdef CONFIG_ACPI 1485 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1486 if (!acpi_disabled && acpi_ioapic) {
1487 apic_id = mp_find_ioapic(0);
1488 if (apic_id < 0)
1489 apic_id = 0;
1490 }
1491#endif
1492
1493 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1486 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1494 idx = find_irq_entry(apic_id, pin, mp_INT); 1487 idx = find_irq_entry(apic_id, pin, mp_INT);
1495 if (idx == -1) { 1488 if (idx == -1) {
@@ -1511,6 +1504,9 @@ static void __init setup_IO_APIC_irqs(void)
1511 1504
1512 irq = pin_2_irq(idx, apic_id, pin); 1505 irq = pin_2_irq(idx, apic_id, pin);
1513 1506
1507 if ((apic_id > 0) && (irq > 16))
1508 continue;
1509
1514 /* 1510 /*
1515 * Skip the timer IRQ if there's a quirk handler 1511 * Skip the timer IRQ if there's a quirk handler
1516 * installed and if it returns 1: 1512 * installed and if it returns 1:
@@ -1698,7 +1694,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1698 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1694 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1699 1695
1700 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1696 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
1701 " Stat Dmod Deli Vect: \n"); 1697 " Stat Dmod Deli Vect:\n");
1702 1698
1703 for (i = 0; i <= reg_01.bits.entries; i++) { 1699 for (i = 0; i <= reg_01.bits.entries; i++) {
1704 struct IO_APIC_route_entry entry; 1700 struct IO_APIC_route_entry entry;
@@ -3875,6 +3871,28 @@ void __init probe_nr_irqs_gsi(void)
3875 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); 3871 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3876} 3872}
3877 3873
3874#ifdef CONFIG_SPARSE_IRQ
3875int __init arch_probe_nr_irqs(void)
3876{
3877 int nr;
3878
3879 if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
3880 nr_irqs = NR_VECTORS * nr_cpu_ids;
3881
3882 nr = nr_irqs_gsi + 8 * nr_cpu_ids;
3883#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
3884 /*
3885 * for MSI and HT dynamic IRQs
3886 */
3887 nr += nr_irqs_gsi * 16;
3888#endif
3889 if (nr < nr_irqs)
3890 nr_irqs = nr;
3891
3892 return 0;
3893}
3894#endif
3895
3878static int __io_apic_set_pci_routing(struct device *dev, int irq, 3896static int __io_apic_set_pci_routing(struct device *dev, int irq,
3879 struct io_apic_irq_attr *irq_attr) 3897 struct io_apic_irq_attr *irq_attr)
3880{ 3898{
@@ -4082,27 +4100,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4082#ifdef CONFIG_SMP 4100#ifdef CONFIG_SMP
4083void __init setup_ioapic_dest(void) 4101void __init setup_ioapic_dest(void)
4084{ 4102{
4085 int pin, ioapic = 0, irq, irq_entry; 4103 int pin, ioapic, irq, irq_entry;
4086 struct irq_desc *desc; 4104 struct irq_desc *desc;
4087 const struct cpumask *mask; 4105 const struct cpumask *mask;
4088 4106
4089 if (skip_ioapic_setup == 1) 4107 if (skip_ioapic_setup == 1)
4090 return; 4108 return;
4091 4109
4092#ifdef CONFIG_ACPI 4110 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
4093 if (!acpi_disabled && acpi_ioapic) {
4094 ioapic = mp_find_ioapic(0);
4095 if (ioapic < 0)
4096 ioapic = 0;
4097 }
4098#endif
4099
4100 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 4111 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4101 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 4112 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4102 if (irq_entry == -1) 4113 if (irq_entry == -1)
4103 continue; 4114 continue;
4104 irq = pin_2_irq(irq_entry, ioapic, pin); 4115 irq = pin_2_irq(irq_entry, ioapic, pin);
4105 4116
4117 if ((ioapic > 0) && (irq > 16))
4118 continue;
4119
4106 desc = irq_to_desc(irq); 4120 desc = irq_to_desc(irq);
4107 4121
4108 /* 4122 /*
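To make the arch_probe_nr_irqs() sizing heuristic above concrete: the hunk caps nr_irqs at NR_VECTORS * nr_cpu_ids, then lowers it to nr_irqs_gsi plus 8 IRQs per CPU, plus 16 more per GSI when MSI or HT dynamic IRQs are configured. A stand-alone re-run of that arithmetic with made-up inputs (64 GSIs, 8 CPUs; not taken from any particular machine):

#include <stdio.h>

int main(void)
{
	/* Illustrative inputs, not from a real system. */
	unsigned int nr_irqs_gsi = 64;	/* GSIs reported by the IO-APICs */
	unsigned int nr_cpu_ids  = 8;	/* possible CPUs */

	/* Same formula as arch_probe_nr_irqs(): */
	unsigned int nr = nr_irqs_gsi + 8 * nr_cpu_ids;	/* 64 + 64 = 128 */
	nr += nr_irqs_gsi * 16;		/* MSI/HT dynamic IRQ headroom: +1024 */

	printf("suggested nr_irqs cap: %u\n", nr);	/* prints 1152 */
	return 0;
}

Note that the kernel only ever lowers nr_irqs to this value; if nr_irqs is already smaller, it is left alone.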
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index f72b5f0f388e..8aa65adbd25d 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -438,8 +438,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
438 * Ayiee, looks like this CPU is stuck ... 438 * Ayiee, looks like this CPU is stuck ...
439 * wait a few IRQs (5 seconds) before doing the oops ... 439 * wait a few IRQs (5 seconds) before doing the oops ...
440 */ 440 */
441 __this_cpu_inc(per_cpu_var(alert_counter)); 441 __this_cpu_inc(alert_counter);
442 if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) 442 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
443 /* 443 /*
444 * die_nmi will return ONLY if NOTIFY_STOP happens.. 444 * die_nmi will return ONLY if NOTIFY_STOP happens..
445 */ 445 */
@@ -447,7 +447,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
447 regs, panic_on_timeout); 447 regs, panic_on_timeout);
448 } else { 448 } else {
449 __get_cpu_var(last_irq_sum) = sum; 449 __get_cpu_var(last_irq_sum) = sum;
450 __this_cpu_write(per_cpu_var(alert_counter), 0); 450 __this_cpu_write(alert_counter, 0);
451 } 451 }
452 452
453 /* see if the nmi watchdog went off */ 453 /* see if the nmi watchdog went off */
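The nmi.c hunk reflects the updated __this_cpu_*() convention: the accessors now take the per-cpu variable directly rather than a per_cpu_var() wrapper. A minimal sketch of the new usage (the variable and function names are illustrative; as in the NMI path above, the caller is assumed to already run with preemption disabled):

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, demo_counter);	/* illustrative per-cpu variable */

static void demo_touch_counter(void)
{
	/* New style: pass the per-cpu variable itself, no per_cpu_var(). */
	__this_cpu_inc(demo_counter);

	if (__this_cpu_read(demo_counter) >= 5)
		__this_cpu_write(demo_counter, 0);
}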
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index be5c4fd47778..3e28401f161c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
225 225
226 mpc_record = 0; 226 mpc_record = 0;
227 printk(KERN_INFO 227 printk(KERN_INFO
228 "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); 228 "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
229 229
230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { 230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
231 printk(KERN_WARNING 231 printk(KERN_WARNING
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 21db3cbea7dc..3740c8a4eae7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
@@ -20,6 +20,8 @@
20#include <linux/cpu.h> 20#include <linux/cpu.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/io.h> 22#include <linux/io.h>
23#include <linux/pci.h>
24#include <linux/kdebug.h>
23 25
24#include <asm/uv/uv_mmrs.h> 26#include <asm/uv/uv_mmrs.h>
25#include <asm/uv/uv_hub.h> 27#include <asm/uv/uv_hub.h>
@@ -34,10 +36,13 @@
34 36
35DEFINE_PER_CPU(int, x2apic_extra_bits); 37DEFINE_PER_CPU(int, x2apic_extra_bits);
36 38
39#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
40
37static enum uv_system_type uv_system_type; 41static enum uv_system_type uv_system_type;
38static u64 gru_start_paddr, gru_end_paddr; 42static u64 gru_start_paddr, gru_end_paddr;
39int uv_min_hub_revision_id; 43int uv_min_hub_revision_id;
40EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); 44EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
45static DEFINE_SPINLOCK(uv_nmi_lock);
41 46
42static inline bool is_GRU_range(u64 start, u64 end) 47static inline bool is_GRU_range(u64 start, u64 end)
43{ 48{
@@ -71,6 +76,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
71 if (!strcmp(oem_id, "SGI")) { 76 if (!strcmp(oem_id, "SGI")) {
72 nodeid = early_get_nodeid(); 77 nodeid = early_get_nodeid();
73 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 78 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
79 x86_platform.nmi_init = uv_nmi_init;
74 if (!strcmp(oem_table_id, "UVL")) 80 if (!strcmp(oem_table_id, "UVL"))
75 uv_system_type = UV_LEGACY_APIC; 81 uv_system_type = UV_LEGACY_APIC;
76 else if (!strcmp(oem_table_id, "UVX")) 82 else if (!strcmp(oem_table_id, "UVX"))
@@ -482,7 +488,7 @@ static void uv_heartbeat(unsigned long ignored)
482 488
483static void __cpuinit uv_heartbeat_enable(int cpu) 489static void __cpuinit uv_heartbeat_enable(int cpu)
484{ 490{
485 if (!uv_cpu_hub_info(cpu)->scir.enabled) { 491 while (!uv_cpu_hub_info(cpu)->scir.enabled) {
486 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; 492 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
487 493
488 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); 494 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
@@ -490,11 +496,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu)
490 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; 496 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
491 add_timer_on(timer, cpu); 497 add_timer_on(timer, cpu);
492 uv_cpu_hub_info(cpu)->scir.enabled = 1; 498 uv_cpu_hub_info(cpu)->scir.enabled = 1;
493 }
494 499
495 /* check boot cpu */ 500 /* also ensure that boot cpu is enabled */
496 if (!uv_cpu_hub_info(0)->scir.enabled) 501 cpu = 0;
497 uv_heartbeat_enable(0); 502 }
498} 503}
499 504
500#ifdef CONFIG_HOTPLUG_CPU 505#ifdef CONFIG_HOTPLUG_CPU
@@ -553,6 +558,30 @@ late_initcall(uv_init_heartbeat);
553 558
554#endif /* !CONFIG_HOTPLUG_CPU */ 559#endif /* !CONFIG_HOTPLUG_CPU */
555 560
561/* Direct Legacy VGA I/O traffic to designated IOH */
562int uv_set_vga_state(struct pci_dev *pdev, bool decode,
563 unsigned int command_bits, bool change_bridge)
564{
565 int domain, bus, rc;
566
567 PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
568 pdev->devfn, decode, command_bits, change_bridge);
569
570 if (!change_bridge)
571 return 0;
572
573 if ((command_bits & PCI_COMMAND_IO) == 0)
574 return 0;
575
576 domain = pci_domain_nr(pdev->bus);
577 bus = pdev->bus->number;
578
579 rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
580 PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
581
582 return rc;
583}
584
556/* 585/*
557 * Called on each cpu to initialize the per_cpu UV data area. 586 * Called on each cpu to initialize the per_cpu UV data area.
558 * FIXME: hotplug not supported yet 587 * FIXME: hotplug not supported yet
@@ -569,6 +598,46 @@ void __cpuinit uv_cpu_init(void)
569 set_x2apic_extra_bits(uv_hub_info->pnode); 598 set_x2apic_extra_bits(uv_hub_info->pnode);
570} 599}
571 600
601/*
602 * When an NMI is received, print a stack trace.
603 */
604int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
605{
606 if (reason != DIE_NMI_IPI)
607 return NOTIFY_OK;
608 /*
609 * Use a lock so only one cpu prints at a time
610 * to prevent intermixed output.
611 */
612 spin_lock(&uv_nmi_lock);
613 pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
614 dump_stack();
615 spin_unlock(&uv_nmi_lock);
616
617 return NOTIFY_STOP;
618}
619
620static struct notifier_block uv_dump_stack_nmi_nb = {
621 .notifier_call = uv_handle_nmi
622};
623
624void uv_register_nmi_notifier(void)
625{
626 if (register_die_notifier(&uv_dump_stack_nmi_nb))
627 printk(KERN_WARNING "UV NMI handler failed to register\n");
628}
629
630void uv_nmi_init(void)
631{
632 unsigned int value;
633
634 /*
635 * Unmask NMI on all cpus
636 */
637 value = apic_read(APIC_LVT1) | APIC_DM_NMI;
638 value &= ~APIC_LVT_MASKED;
639 apic_write(APIC_LVT1, value);
640}
572 641
573void __init uv_system_init(void) 642void __init uv_system_init(void)
574{ 643{
@@ -634,8 +703,8 @@ void __init uv_system_init(void)
634 } 703 }
635 704
636 uv_bios_init(); 705 uv_bios_init();
637 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 706 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
638 &sn_coherency_id, &sn_region_size); 707 &sn_region_size, &system_serial_number);
639 uv_rtc_init(); 708 uv_rtc_init();
640 709
641 for_each_present_cpu(cpu) { 710 for_each_present_cpu(cpu) {
@@ -690,5 +759,9 @@ void __init uv_system_init(void)
690 759
691 uv_cpu_init(); 760 uv_cpu_init();
692 uv_scir_register_cpu_notifier(); 761 uv_scir_register_cpu_notifier();
762 uv_register_nmi_notifier();
693 proc_mkdir("sgi_uv", NULL); 763 proc_mkdir("sgi_uv", NULL);
764
765 /* register Legacy VGA I/O redirection handler */
766 pci_register_set_vga_state(uv_set_vga_state);
694} 767}
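The UV NMI hook follows the standard die-notifier pattern; a stripped-down sketch of the same registration flow, with hypothetical names, looks like this:

#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/smp.h>

/* Hypothetical handler: react only to NMI IPI events, pass everything else on. */
static int demo_nmi_notify(struct notifier_block *self,
			   unsigned long reason, void *data)
{
	if (reason != DIE_NMI_IPI)
		return NOTIFY_OK;	/* not ours, keep walking the chain */

	pr_info("demo: NMI IPI seen on cpu %d\n", smp_processor_id());
	return NOTIFY_STOP;		/* handled, stop notifying */
}

static struct notifier_block demo_nmi_nb = {
	.notifier_call	= demo_nmi_notify,
};

static void demo_register_nmi_notifier(void)
{
	if (register_die_notifier(&demo_nmi_nb))
		pr_warning("demo: NMI notifier failed to register\n");
}

uv_handle_nmi() above additionally takes a spinlock around dump_stack() purely to keep the output of concurrently-NMI'd CPUs from interleaving; the notifier machinery itself does not require it.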
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index b5b6b23bce53..031aa887b0eb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1992,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
1992 apm_info.disabled = 1; 1992 apm_info.disabled = 1;
1993 printk(KERN_INFO "%s machine detected. " 1993 printk(KERN_INFO "%s machine detected. "
1994 "Disabling APM.\n", d->ident); 1994 "Disabling APM.\n", d->ident);
1995 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); 1995 printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
1996 printk(KERN_INFO "download from support.intel.com \n"); 1996 printk(KERN_INFO "download from support.intel.com\n");
1997 } 1997 }
1998 return 0; 1998 return 0;
1999} 1999}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index b0206a211b09..8bc57baaa9ad 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -15,8 +15,8 @@
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 * 17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. 18 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson 19 * Copyright (c) Russ Anderson <rja@sgi.com>
20 */ 20 */
21 21
22#include <linux/efi.h> 22#include <linux/efi.h>
@@ -30,6 +30,7 @@ static struct uv_systab uv_systab;
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) 30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{ 31{
32 struct uv_systab *tab = &uv_systab; 32 struct uv_systab *tab = &uv_systab;
33 s64 ret;
33 34
34 if (!tab->function) 35 if (!tab->function)
35 /* 36 /*
@@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
37 */ 38 */
38 return BIOS_STATUS_UNIMPLEMENTED; 39 return BIOS_STATUS_UNIMPLEMENTED;
39 40
40 return efi_call6((void *)__va(tab->function), 41 ret = efi_call6((void *)__va(tab->function), (u64)which,
41 (u64)which, a1, a2, a3, a4, a5); 42 a1, a2, a3, a4, a5);
43 return ret;
42} 44}
45EXPORT_SYMBOL_GPL(uv_bios_call);
43 46
44s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, 47s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
45 u64 a4, u64 a5) 48 u64 a4, u64 a5)
@@ -73,11 +76,14 @@ long sn_coherency_id;
73EXPORT_SYMBOL_GPL(sn_coherency_id); 76EXPORT_SYMBOL_GPL(sn_coherency_id);
74long sn_region_size; 77long sn_region_size;
75EXPORT_SYMBOL_GPL(sn_region_size); 78EXPORT_SYMBOL_GPL(sn_region_size);
79long system_serial_number;
80EXPORT_SYMBOL_GPL(system_serial_number);
76int uv_type; 81int uv_type;
82EXPORT_SYMBOL_GPL(uv_type);
77 83
78 84
79s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, 85s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
80 long *region) 86 long *region, long *ssn)
81{ 87{
82 s64 ret; 88 s64 ret;
83 u64 v0, v1; 89 u64 v0, v1;
@@ -97,8 +103,11 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
97 *coher = part.coherence_id; 103 *coher = part.coherence_id;
98 if (region) 104 if (region)
99 *region = part.region_size; 105 *region = part.region_size;
106 if (ssn)
107 *ssn = v1;
100 return ret; 108 return ret;
101} 109}
110EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
102 111
103int 112int
104uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, 113uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
@@ -154,6 +163,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
154} 163}
155EXPORT_SYMBOL_GPL(uv_bios_freq_base); 164EXPORT_SYMBOL_GPL(uv_bios_freq_base);
156 165
166/*
167 * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
168 * @decode: true to enable target, false to disable target
169 * @domain: PCI domain number
170 * @bus: PCI bus number
171 *
172 * Returns:
173 * 0: Success
174 * -EINVAL: Invalid domain or bus number
175 * -ENOSYS: Capability not available
176 * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
177 */
178int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
179{
180 return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
181 (u64)decode, (u64)domain, (u64)bus, 0, 0);
182}
183EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
184
157 185
158#ifdef CONFIG_EFI 186#ifdef CONFIG_EFI
159void uv_bios_init(void) 187void uv_bios_init(void)
@@ -185,4 +213,3 @@ void uv_bios_init(void)
185 213
186void uv_bios_init(void) { } 214void uv_bios_init(void) { }
187#endif 215#endif
188
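Since uv_bios_get_sn_info() grows a sixth out-parameter for the system serial number, existing callers must be updated; the patch does this for uv_system_init(). A caller sketch against the new prototype (the names are illustrative, and the BIOS_STATUS_SUCCESS check assumes the status codes defined in <asm/uv/bios.h>):

#include <linux/kernel.h>
#include <asm/uv/bios.h>

/* Illustrative caller of the widened uv_bios_get_sn_info() prototype. */
static void demo_query_sn_info(void)
{
	int uvtype;
	long partid, coher, region, ssn;
	s64 ret;

	ret = uv_bios_get_sn_info(0, &uvtype, &partid, &coher, &region, &ssn);
	if (ret != BIOS_STATUS_SUCCESS)
		return;		/* BIOS call unavailable or failed */

	pr_info("UV: partition %ld, coherency %ld, serial# %ld\n",
		partid, coher, ssn);
}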
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 468489b57aae..97ad79cdf688 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, 34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
36 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
37 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
38 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
35 { 0, 0, 0, 0 } 39 { 0, 0, 0, 0 }
36 }; 40 };
37 41
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index f138c6c389b9..870e6cc6ad28 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -10,6 +10,20 @@ if CPU_FREQ
10 10
11comment "CPUFreq processor drivers" 11comment "CPUFreq processor drivers"
12 12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
17 This driver adds support for the Processor Clocking Control (PCC) interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
13config X86_ACPI_CPUFREQ 27config X86_ACPI_CPUFREQ
14 tristate "ACPI Processor P-States driver" 28 tristate "ACPI Processor P-States driver"
15 select CPU_FREQ_TABLE 29 select CPU_FREQ_TABLE
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 509296df294d..1840c0a5170b 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -4,6 +4,7 @@
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
7obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
8obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
9obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
new file mode 100644
index 000000000000..ff36d2979a90
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -0,0 +1,620 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33
34#include <linux/acpi.h>
35#include <linux/io.h>
36#include <linux/spinlock.h>
37#include <linux/uaccess.h>
38
39#include <acpi/processor.h>
40
41#define PCC_VERSION "1.00.00"
42#define POLL_LOOPS 300
43
44#define CMD_COMPLETE 0x1
45#define CMD_GET_FREQ 0x0
46#define CMD_SET_FREQ 0x1
47
48#define BUF_SZ 4
49
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "pcc-cpufreq", msg)
52
53struct pcc_register_resource {
54 u8 descriptor;
55 u16 length;
56 u8 space_id;
57 u8 bit_width;
58 u8 bit_offset;
59 u8 access_size;
60 u64 address;
61} __attribute__ ((packed));
62
63struct pcc_memory_resource {
64 u8 descriptor;
65 u16 length;
66 u8 space_id;
67 u8 resource_usage;
68 u8 type_specific;
69 u64 granularity;
70 u64 minimum;
71 u64 maximum;
72 u64 translation_offset;
73 u64 address_length;
74} __attribute__ ((packed));
75
76static struct cpufreq_driver pcc_cpufreq_driver;
77
78struct pcc_header {
79 u32 signature;
80 u16 length;
81 u8 major;
82 u8 minor;
83 u32 features;
84 u16 command;
85 u16 status;
86 u32 latency;
87 u32 minimum_time;
88 u32 maximum_time;
89 u32 nominal;
90 u32 throttled_frequency;
91 u32 minimum_frequency;
92};
93
94static void __iomem *pcch_virt_addr;
95static struct pcc_header __iomem *pcch_hdr;
96
97static DEFINE_SPINLOCK(pcc_lock);
98
99static struct acpi_generic_address doorbell;
100
101static u64 doorbell_preserve;
102static u64 doorbell_write;
103
104static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
105 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
106
107struct pcc_cpu {
108 u32 input_offset;
109 u32 output_offset;
110};
111
112static struct pcc_cpu *pcc_cpu_info;
113
114static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
115{
116 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
117 policy->cpuinfo.max_freq);
118 return 0;
119}
120
121static inline void pcc_cmd(void)
122{
123 u64 doorbell_value;
124 int i;
125
126 acpi_read(&doorbell_value, &doorbell);
127 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
128 &doorbell);
129
130 for (i = 0; i < POLL_LOOPS; i++) {
131 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
132 break;
133 }
134}
135
136static inline void pcc_clear_mapping(void)
137{
138 if (pcch_virt_addr)
139 iounmap(pcch_virt_addr);
140 pcch_virt_addr = NULL;
141}
142
143static unsigned int pcc_get_freq(unsigned int cpu)
144{
145 struct pcc_cpu *pcc_cpu_data;
146 unsigned int curr_freq;
147 unsigned int freq_limit;
148 u16 status;
149 u32 input_buffer;
150 u32 output_buffer;
151
152 spin_lock(&pcc_lock);
153
154 dprintk("get: get_freq for CPU %d\n", cpu);
155 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
156
157 input_buffer = 0x1;
158 iowrite32(input_buffer,
159 (pcch_virt_addr + pcc_cpu_data->input_offset));
160 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
161
162 pcc_cmd();
163
164 output_buffer =
165 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
166
167 /* Clear the input buffer - we are done with the current command */
168 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
169
170 status = ioread16(&pcch_hdr->status);
171 if (status != CMD_COMPLETE) {
172 dprintk("get: FAILED: for CPU %d, status is %d\n",
173 cpu, status);
174 goto cmd_incomplete;
175 }
176 iowrite16(0, &pcch_hdr->status);
177 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
178 / 100) * 1000);
179
180 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
181 "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
182 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
183 output_buffer, curr_freq);
184
185 freq_limit = (output_buffer >> 8) & 0xff;
186 if (freq_limit != 0xff) {
187 dprintk("get: frequency for cpu %d is being temporarily"
188 " capped at %d\n", cpu, curr_freq);
189 }
190
191 spin_unlock(&pcc_lock);
192 return curr_freq;
193
194cmd_incomplete:
195 iowrite16(0, &pcch_hdr->status);
196 spin_unlock(&pcc_lock);
197 return -EINVAL;
198}
199
200static int pcc_cpufreq_target(struct cpufreq_policy *policy,
201 unsigned int target_freq,
202 unsigned int relation)
203{
204 struct pcc_cpu *pcc_cpu_data;
205 struct cpufreq_freqs freqs;
206 u16 status;
207 u32 input_buffer;
208 int cpu;
209
210 spin_lock(&pcc_lock);
211 cpu = policy->cpu;
212 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
213
214 dprintk("target: CPU %d should go to target freq: %d "
215 "(virtual) input_offset is 0x%x\n",
216 cpu, target_freq,
217 (pcch_virt_addr + pcc_cpu_data->input_offset));
218
219 freqs.new = target_freq;
220 freqs.cpu = cpu;
221 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
222
223 input_buffer = 0x1 | (((target_freq * 100)
224 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
225 iowrite32(input_buffer,
226 (pcch_virt_addr + pcc_cpu_data->input_offset));
227 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
228
229 pcc_cmd();
230
231 /* Clear the input buffer - we are done with the current command */
232 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
233
234 status = ioread16(&pcch_hdr->status);
235 if (status != CMD_COMPLETE) {
236 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
237 cpu, status);
238 goto cmd_incomplete;
239 }
240 iowrite16(0, &pcch_hdr->status);
241
242 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
243 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
244 spin_unlock(&pcc_lock);
245
246 return 0;
247
248cmd_incomplete:
249 iowrite16(0, &pcch_hdr->status);
250 spin_unlock(&pcc_lock);
251 return -EINVAL;
252}
253
254static int pcc_get_offset(int cpu)
255{
256 acpi_status status;
257 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
258 union acpi_object *pccp, *offset;
259 struct pcc_cpu *pcc_cpu_data;
260 struct acpi_processor *pr;
261 int ret = 0;
262
263 pr = per_cpu(processors, cpu);
264 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
265
266 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
267 if (ACPI_FAILURE(status))
268 return -ENODEV;
269
270 pccp = buffer.pointer;
271 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
272 ret = -ENODEV;
273 goto out_free;
274 };
275
276 offset = &(pccp->package.elements[0]);
277 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
278 ret = -ENODEV;
279 goto out_free;
280 }
281
282 pcc_cpu_data->input_offset = offset->integer.value;
283
284 offset = &(pccp->package.elements[1]);
285 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
286 ret = -ENODEV;
287 goto out_free;
288 }
289
290 pcc_cpu_data->output_offset = offset->integer.value;
291
292 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
293 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
294
295 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
296 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
297 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
298out_free:
299 kfree(buffer.pointer);
300 return ret;
301}
302
303static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
304{
305 acpi_status status;
306 struct acpi_object_list input;
307 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
308 union acpi_object in_params[4];
309 union acpi_object *out_obj;
310 u32 capabilities[2];
311 u32 errors;
312 u32 supported;
313 int ret = 0;
314
315 input.count = 4;
316 input.pointer = in_params;
317 input.count = 4;
318 input.pointer = in_params;
319 in_params[0].type = ACPI_TYPE_BUFFER;
320 in_params[0].buffer.length = 16;
321 in_params[0].buffer.pointer = OSC_UUID;
322 in_params[1].type = ACPI_TYPE_INTEGER;
323 in_params[1].integer.value = 1;
324 in_params[2].type = ACPI_TYPE_INTEGER;
325 in_params[2].integer.value = 2;
326 in_params[3].type = ACPI_TYPE_BUFFER;
327 in_params[3].buffer.length = 8;
328 in_params[3].buffer.pointer = (u8 *)&capabilities;
329
330 capabilities[0] = OSC_QUERY_ENABLE;
331 capabilities[1] = 0x1;
332
333 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
334 if (ACPI_FAILURE(status))
335 return -ENODEV;
336
337 if (!output.length)
338 return -ENODEV;
339
340 out_obj = output.pointer;
341 if (out_obj->type != ACPI_TYPE_BUFFER) {
342 ret = -ENODEV;
343 goto out_free;
344 }
345
346 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
347 if (errors) {
348 ret = -ENODEV;
349 goto out_free;
350 }
351
352 supported = *((u32 *)(out_obj->buffer.pointer + 4));
353 if (!(supported & 0x1)) {
354 ret = -ENODEV;
355 goto out_free;
356 }
357
358 kfree(output.pointer);
359 capabilities[0] = 0x0;
360 capabilities[1] = 0x1;
361
362 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
363 if (ACPI_FAILURE(status))
364 return -ENODEV;
365
366 if (!output.length)
367 return -ENODEV;
368
369 out_obj = output.pointer;
370 if (out_obj->type != ACPI_TYPE_BUFFER) {
371 ret = -ENODEV;
372 goto out_free;
373 }
374
375 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
376 if (errors) {
377 ret = -ENODEV;
378 goto out_free;
379 }
380
381 supported = *((u32 *)(out_obj->buffer.pointer + 4));
382 if (!(supported & 0x1)) {
383 ret = -ENODEV;
384 goto out_free;
385 }
386
387out_free:
388 kfree(output.pointer);
389 return ret;
390}
391
392static int __init pcc_cpufreq_probe(void)
393{
394 acpi_status status;
395 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
396 struct pcc_memory_resource *mem_resource;
397 struct pcc_register_resource *reg_resource;
398 union acpi_object *out_obj, *member;
399 acpi_handle handle, osc_handle;
400 int ret = 0;
401
402 status = acpi_get_handle(NULL, "\\_SB", &handle);
403 if (ACPI_FAILURE(status))
404 return -ENODEV;
405
406 status = acpi_get_handle(handle, "_OSC", &osc_handle);
407 if (ACPI_SUCCESS(status)) {
408 ret = pcc_cpufreq_do_osc(&osc_handle);
409 if (ret)
410 dprintk("probe: _OSC evaluation did not succeed\n");
411 /* Firmware's use of _OSC is optional */
412 ret = 0;
413 }
414
415 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
416 if (ACPI_FAILURE(status))
417 return -ENODEV;
418
419 out_obj = output.pointer;
420 if (out_obj->type != ACPI_TYPE_PACKAGE) {
421 ret = -ENODEV;
422 goto out_free;
423 }
424
425 member = &out_obj->package.elements[0];
426 if (member->type != ACPI_TYPE_BUFFER) {
427 ret = -ENODEV;
428 goto out_free;
429 }
430
431 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
432
433 dprintk("probe: mem_resource descriptor: 0x%x,"
434 " length: %d, space_id: %d, resource_usage: %d,"
435 " type_specific: %d, granularity: 0x%llx,"
436 " minimum: 0x%llx, maximum: 0x%llx,"
437 " translation_offset: 0x%llx, address_length: 0x%llx\n",
438 mem_resource->descriptor, mem_resource->length,
439 mem_resource->space_id, mem_resource->resource_usage,
440 mem_resource->type_specific, mem_resource->granularity,
441 mem_resource->minimum, mem_resource->maximum,
442 mem_resource->translation_offset,
443 mem_resource->address_length);
444
445 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
446 ret = -ENODEV;
447 goto out_free;
448 }
449
450 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
451 mem_resource->address_length);
452 if (pcch_virt_addr == NULL) {
453 dprintk("probe: could not map shared mem region\n");
454 goto out_free;
455 }
456 pcch_hdr = pcch_virt_addr;
457
458 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
459 dprintk("probe: PCCH header is at physical address: 0x%llx,"
460 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
461 " supported features: 0x%x, command field: 0x%x,"
462 " status field: 0x%x, nominal latency: %d us\n",
463 mem_resource->minimum, ioread32(&pcch_hdr->signature),
464 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
465 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
466 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
467 ioread32(&pcch_hdr->latency));
468
469 dprintk("probe: min time between commands: %d us,"
470 " max time between commands: %d us,"
471 " nominal CPU frequency: %d MHz,"
472 " minimum CPU frequency: %d MHz,"
473 " minimum CPU frequency without throttling: %d MHz\n",
474 ioread32(&pcch_hdr->minimum_time),
475 ioread32(&pcch_hdr->maximum_time),
476 ioread32(&pcch_hdr->nominal),
477 ioread32(&pcch_hdr->throttled_frequency),
478 ioread32(&pcch_hdr->minimum_frequency));
479
480 member = &out_obj->package.elements[1];
481 if (member->type != ACPI_TYPE_BUFFER) {
482 ret = -ENODEV;
483 goto pcch_free;
484 }
485
486 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
487
488 doorbell.space_id = reg_resource->space_id;
489 doorbell.bit_width = reg_resource->bit_width;
490 doorbell.bit_offset = reg_resource->bit_offset;
491 doorbell.access_width = 64;
492 doorbell.address = reg_resource->address;
493
494 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
495 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
496 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
497 doorbell.access_width, reg_resource->address);
498
499 member = &out_obj->package.elements[2];
500 if (member->type != ACPI_TYPE_INTEGER) {
501 ret = -ENODEV;
502 goto pcch_free;
503 }
504
505 doorbell_preserve = member->integer.value;
506
507 member = &out_obj->package.elements[3];
508 if (member->type != ACPI_TYPE_INTEGER) {
509 ret = -ENODEV;
510 goto pcch_free;
511 }
512
513 doorbell_write = member->integer.value;
514
515 dprintk("probe: doorbell_preserve: 0x%llx,"
516 " doorbell_write: 0x%llx\n",
517 doorbell_preserve, doorbell_write);
518
519 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
520 if (!pcc_cpu_info) {
521 ret = -ENOMEM;
522 goto pcch_free;
523 }
524
525 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
526 " limits: %d MHz, %d MHz\n", PCC_VERSION,
527 ioread32(&pcch_hdr->minimum_frequency),
528 ioread32(&pcch_hdr->nominal));
529 kfree(output.pointer);
530 return ret;
531pcch_free:
532 pcc_clear_mapping();
533out_free:
534 kfree(output.pointer);
535 return ret;
536}
537
538static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
539{
540 unsigned int cpu = policy->cpu;
541 unsigned int result = 0;
542
543 if (!pcch_virt_addr) {
544 result = -1;
545 goto pcch_null;
546 }
547
548 result = pcc_get_offset(cpu);
549 if (result) {
550 dprintk("init: PCCP evaluation failed\n");
551 goto free;
552 }
553
554 policy->max = policy->cpuinfo.max_freq =
555 ioread32(&pcch_hdr->nominal) * 1000;
556 policy->min = policy->cpuinfo.min_freq =
557 ioread32(&pcch_hdr->minimum_frequency) * 1000;
558 policy->cur = pcc_get_freq(cpu);
559
560 dprintk("init: policy->max is %d, policy->min is %d\n",
561 policy->max, policy->min);
562
563 return 0;
564free:
565 pcc_clear_mapping();
566 free_percpu(pcc_cpu_info);
567pcch_null:
568 return result;
569}
570
571static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
572{
573 return 0;
574}
575
576static struct cpufreq_driver pcc_cpufreq_driver = {
577 .flags = CPUFREQ_CONST_LOOPS,
578 .get = pcc_get_freq,
579 .verify = pcc_cpufreq_verify,
580 .target = pcc_cpufreq_target,
581 .init = pcc_cpufreq_cpu_init,
582 .exit = pcc_cpufreq_cpu_exit,
583 .name = "pcc-cpufreq",
584 .owner = THIS_MODULE,
585};
586
587static int __init pcc_cpufreq_init(void)
588{
589 int ret;
590
591 if (acpi_disabled)
592 return 0;
593
594 ret = pcc_cpufreq_probe();
595 if (ret) {
596 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
597 return ret;
598 }
599
600 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
601
602 return ret;
603}
604
605static void __exit pcc_cpufreq_exit(void)
606{
607 cpufreq_unregister_driver(&pcc_cpufreq_driver);
608
609 pcc_clear_mapping();
610
611 free_percpu(pcc_cpu_info);
612}
613
614MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
615MODULE_VERSION(PCC_VERSION);
616MODULE_DESCRIPTION("Processor Clocking Control interface driver");
617MODULE_LICENSE("GPL");
618
619late_initcall(pcc_cpufreq_init);
620module_exit(pcc_cpufreq_exit);
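The PCC command buffers encode frequencies as a percentage of the nominal frequency: CMD_SET_FREQ places the percentage in bits 15:8 of the per-cpu input buffer, and CMD_GET_FREQ returns it in the low byte of the output buffer. The nominal frequency is read in MHz while cpufreq works in kHz, which is what the *100 and *1000 factors account for. A stand-alone re-run of that conversion with an illustrative 2 GHz nominal frequency:

#include <stdio.h>

int main(void)
{
	/* Illustrative values: pcch_hdr->nominal is in MHz, cpufreq uses kHz. */
	unsigned int nominal_mhz = 2000;	/* 2.0 GHz nominal */
	unsigned int target_khz  = 1500000;	/* ask for 1.5 GHz */

	/* CMD_SET_FREQ: encode the target as a percentage of nominal. */
	unsigned int ratio = (target_khz * 100) / (nominal_mhz * 1000);	/* 75 */

	/* CMD_GET_FREQ: decode a returned percentage back into kHz. */
	unsigned int curr_khz = ((nominal_mhz * ratio) / 100) * 1000;	/* 1500000 */

	printf("ratio byte: %u, decoded frequency: %u kHz\n", ratio, curr_khz);
	return 0;
}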
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6e44519960c8..d360b56e9825 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data)
806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, 806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
807 unsigned int index) 807 unsigned int index)
808{ 808{
809 acpi_integer control; 809 u64 control;
810 810
811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
812 return; 812 return;
@@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
824{ 824{
825 struct cpufreq_frequency_table *powernow_table; 825 struct cpufreq_frequency_table *powernow_table;
826 int ret_val = -ENODEV; 826 int ret_val = -ENODEV;
827 acpi_integer control, status; 827 u64 control, status;
828 828
829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
830 dprintk("register performance failed: bad ACPI data\n"); 830 dprintk("register performance failed: bad ACPI data\n");
@@ -948,7 +948,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
948 u32 fid; 948 u32 fid;
949 u32 vid; 949 u32 vid;
950 u32 freq, index; 950 u32 freq, index;
951 acpi_integer status, control; 951 u64 status, control;
952 952
953 if (data->exttype) { 953 if (data->exttype) {
954 status = data->acpi_data.states[i].status; 954 status = data->acpi_data.states[i].status;
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index fc6c8ef92dcc..eddb1bdd1b8f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -18,6 +18,7 @@
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/k8.h>
21#include <asm/smp.h>
21 22
22#define LVL_1_INST 1 23#define LVL_1_INST 1
23#define LVL_1_DATA 2 24#define LVL_1_DATA 2
@@ -31,6 +32,8 @@ struct _cache_table {
31 short size; 32 short size;
32}; 33};
33 34
35#define MB(x) ((x) * 1024)
36
34/* All the cache descriptor types we care about (no TLB or 37/* All the cache descriptor types we care about (no TLB or
35 trace cache entries) */ 38 trace cache entries) */
36 39
@@ -44,9 +47,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
44 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ 47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
45 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ 48 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
46 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 49 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
47 { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 50 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
48 { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 51 { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
49 { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 52 { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
50 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ 53 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
51 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ 54 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
52 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 55 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -59,16 +62,16 @@ static const struct _cache_table __cpuinitconst cache_table[] =
59 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ 62 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
60 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ 63 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
61 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ 64 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
62 { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */ 65 { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
63 { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */ 66 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
64 { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */ 67 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
65 { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */ 68 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
66 { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 69 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
67 { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */ 70 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
68 { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 71 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
69 { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ 72 { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
70 { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ 73 { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
71 { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */ 74 { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
72 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 75 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
73 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 76 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
74 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 77 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -77,34 +80,34 @@ static const struct _cache_table __cpuinitconst cache_table[] =
77 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ 80 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
78 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ 81 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
79 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ 82 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
80 { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */ 83 { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
81 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 84 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
82 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 85 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
83 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 86 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
84 { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 87 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
85 { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */ 88 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
86 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ 89 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
87 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ 90 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
88 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ 91 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
89 { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */ 92 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
90 { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */ 93 { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
91 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ 94 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
92 { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */ 95 { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
93 { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ 96 { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ 97 { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ 98 { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ 99 { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
97 { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */ 100 { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 101 { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ 102 { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 103 { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
101 { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */ 104 { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ 105 { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 106 { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 107 { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
105 { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */ 108 { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
106 { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */ 109 { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
107 { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */ 110 { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
108 { 0x00, 0, 0} 111 { 0x00, 0, 0}
109}; 112};
110 113
@@ -150,7 +153,8 @@ struct _cpuid4_info {
150 union _cpuid4_leaf_ebx ebx; 153 union _cpuid4_leaf_ebx ebx;
151 union _cpuid4_leaf_ecx ecx; 154 union _cpuid4_leaf_ecx ecx;
152 unsigned long size; 155 unsigned long size;
153 unsigned long can_disable; 156 bool can_disable;
157 unsigned int l3_indices;
154 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 158 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
155}; 159};
156 160
@@ -160,7 +164,8 @@ struct _cpuid4_info_regs {
160 union _cpuid4_leaf_ebx ebx; 164 union _cpuid4_leaf_ebx ebx;
161 union _cpuid4_leaf_ecx ecx; 165 union _cpuid4_leaf_ecx ecx;
162 unsigned long size; 166 unsigned long size;
163 unsigned long can_disable; 167 bool can_disable;
168 unsigned int l3_indices;
164}; 169};
165 170
166unsigned short num_cache_leaves; 171unsigned short num_cache_leaves;
@@ -290,6 +295,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
290 (ebx->split.ways_of_associativity + 1) - 1; 295 (ebx->split.ways_of_associativity + 1) - 1;
291} 296}
292 297
298struct _cache_attr {
299 struct attribute attr;
300 ssize_t (*show)(struct _cpuid4_info *, char *);
301 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
302};
303
304#ifdef CONFIG_CPU_SUP_AMD
305static unsigned int __cpuinit amd_calc_l3_indices(void)
306{
307 /*
308 * We're called over smp_call_function_single() and therefore
309 * are on the correct cpu.
310 */
311 int cpu = smp_processor_id();
312 int node = cpu_to_node(cpu);
313 struct pci_dev *dev = node_to_k8_nb_misc(node);
314 unsigned int sc0, sc1, sc2, sc3;
315 u32 val = 0;
316
317 pci_read_config_dword(dev, 0x1C4, &val);
318
319 /* calculate subcache sizes */
320 sc0 = !(val & BIT(0));
321 sc1 = !(val & BIT(4));
322 sc2 = !(val & BIT(8)) + !(val & BIT(9));
323 sc3 = !(val & BIT(12)) + !(val & BIT(13));
324
325 return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
326}
327
293static void __cpuinit 328static void __cpuinit
294amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 329amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
295{ 330{
@@ -299,12 +334,103 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
299 if (boot_cpu_data.x86 == 0x11) 334 if (boot_cpu_data.x86 == 0x11)
300 return; 335 return;
301 336
302 /* see erratum #382 */ 337 /* see errata #382 and #388 */
303 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) 338 if ((boot_cpu_data.x86 == 0x10) &&
339 ((boot_cpu_data.x86_model < 0x8) ||
340 (boot_cpu_data.x86_mask < 0x1)))
304 return; 341 return;
305 342
306 this_leaf->can_disable = 1; 343 this_leaf->can_disable = true;
344 this_leaf->l3_indices = amd_calc_l3_indices();
345}
346
347static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
348 unsigned int index)
349{
350 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
351 int node = amd_get_nb_id(cpu);
352 struct pci_dev *dev = node_to_k8_nb_misc(node);
353 unsigned int reg = 0;
354
355 if (!this_leaf->can_disable)
356 return -EINVAL;
357
358 if (!dev)
359 return -EINVAL;
360
361 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
362 return sprintf(buf, "0x%08x\n", reg);
363}
364
365#define SHOW_CACHE_DISABLE(index) \
366static ssize_t \
367show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
368{ \
369 return show_cache_disable(this_leaf, buf, index); \
307} 370}
371SHOW_CACHE_DISABLE(0)
372SHOW_CACHE_DISABLE(1)
373
374static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
375 const char *buf, size_t count, unsigned int index)
376{
377 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
378 int node = amd_get_nb_id(cpu);
379 struct pci_dev *dev = node_to_k8_nb_misc(node);
380 unsigned long val = 0;
381
382#define SUBCACHE_MASK (3UL << 20)
383#define SUBCACHE_INDEX 0xfff
384
385 if (!this_leaf->can_disable)
386 return -EINVAL;
387
388 if (!capable(CAP_SYS_ADMIN))
389 return -EPERM;
390
391 if (!dev)
392 return -EINVAL;
393
394 if (strict_strtoul(buf, 10, &val) < 0)
395 return -EINVAL;
396
397 /* do not allow writes outside of allowed bits */
398 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
399 ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
400 return -EINVAL;
401
402 val |= BIT(30);
403 pci_write_config_dword(dev, 0x1BC + index * 4, val);
404 /*
405 * We need to WBINVD on a core on the node containing the L3 cache whose
406 * indices we disable; a simple wbinvd() is therefore not sufficient.
407 */
408 wbinvd_on_cpu(cpu);
409 pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
410 return count;
411}
412
413#define STORE_CACHE_DISABLE(index) \
414static ssize_t \
415store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
416 const char *buf, size_t count) \
417{ \
418 return store_cache_disable(this_leaf, buf, count, index); \
419}
420STORE_CACHE_DISABLE(0)
421STORE_CACHE_DISABLE(1)
422
423static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
424 show_cache_disable_0, store_cache_disable_0);
425static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
426 show_cache_disable_1, store_cache_disable_1);
427
428#else /* CONFIG_CPU_SUP_AMD */
429static void __cpuinit
430amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
431{
432};
433#endif /* CONFIG_CPU_SUP_AMD */
308 434
309static int 435static int
310__cpuinit cpuid4_cache_lookup_regs(int index, 436__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -711,82 +837,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
711#define to_object(k) container_of(k, struct _index_kobject, kobj) 837#define to_object(k) container_of(k, struct _index_kobject, kobj)
712#define to_attr(a) container_of(a, struct _cache_attr, attr) 838#define to_attr(a) container_of(a, struct _cache_attr, attr)
713 839
714static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
715 unsigned int index)
716{
717 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
718 int node = cpu_to_node(cpu);
719 struct pci_dev *dev = node_to_k8_nb_misc(node);
720 unsigned int reg = 0;
721
722 if (!this_leaf->can_disable)
723 return -EINVAL;
724
725 if (!dev)
726 return -EINVAL;
727
728 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
729 return sprintf(buf, "%x\n", reg);
730}
731
732#define SHOW_CACHE_DISABLE(index) \
733static ssize_t \
734show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
735{ \
736 return show_cache_disable(this_leaf, buf, index); \
737}
738SHOW_CACHE_DISABLE(0)
739SHOW_CACHE_DISABLE(1)
740
741static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
742 const char *buf, size_t count, unsigned int index)
743{
744 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
745 int node = cpu_to_node(cpu);
746 struct pci_dev *dev = node_to_k8_nb_misc(node);
747 unsigned long val = 0;
748 unsigned int scrubber = 0;
749
750 if (!this_leaf->can_disable)
751 return -EINVAL;
752
753 if (!capable(CAP_SYS_ADMIN))
754 return -EPERM;
755
756 if (!dev)
757 return -EINVAL;
758
759 if (strict_strtoul(buf, 10, &val) < 0)
760 return -EINVAL;
761
762 val |= 0xc0000000;
763
764 pci_read_config_dword(dev, 0x58, &scrubber);
765 scrubber &= ~0x1f000000;
766 pci_write_config_dword(dev, 0x58, scrubber);
767
768 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
769 wbinvd();
770 pci_write_config_dword(dev, 0x1BC + index * 4, val);
771 return count;
772}
773
774#define STORE_CACHE_DISABLE(index) \
775static ssize_t \
776store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
777 const char *buf, size_t count) \
778{ \
779 return store_cache_disable(this_leaf, buf, count, index); \
780}
781STORE_CACHE_DISABLE(0)
782STORE_CACHE_DISABLE(1)
783
784struct _cache_attr {
785 struct attribute attr;
786 ssize_t (*show)(struct _cpuid4_info *, char *);
787 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
788};
789
790#define define_one_ro(_name) \ 840#define define_one_ro(_name) \
791static struct _cache_attr _name = \ 841static struct _cache_attr _name = \
792 __ATTR(_name, 0444, show_##_name, NULL) 842 __ATTR(_name, 0444, show_##_name, NULL)
@@ -801,23 +851,28 @@ define_one_ro(size);
801define_one_ro(shared_cpu_map); 851define_one_ro(shared_cpu_map);
802define_one_ro(shared_cpu_list); 852define_one_ro(shared_cpu_list);
803 853
804static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, 854#define DEFAULT_SYSFS_CACHE_ATTRS \
805 show_cache_disable_0, store_cache_disable_0); 855 &type.attr, \
806static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 856 &level.attr, \
807 show_cache_disable_1, store_cache_disable_1); 857 &coherency_line_size.attr, \
858 &physical_line_partition.attr, \
859 &ways_of_associativity.attr, \
860 &number_of_sets.attr, \
861 &size.attr, \
862 &shared_cpu_map.attr, \
863 &shared_cpu_list.attr
808 864
809static struct attribute *default_attrs[] = { 865static struct attribute *default_attrs[] = {
810 &type.attr, 866 DEFAULT_SYSFS_CACHE_ATTRS,
811 &level.attr, 867 NULL
812 &coherency_line_size.attr, 868};
813 &physical_line_partition.attr, 869
814 &ways_of_associativity.attr, 870static struct attribute *default_l3_attrs[] = {
815 &number_of_sets.attr, 871 DEFAULT_SYSFS_CACHE_ATTRS,
816 &size.attr, 872#ifdef CONFIG_CPU_SUP_AMD
817 &shared_cpu_map.attr,
818 &shared_cpu_list.attr,
819 &cache_disable_0.attr, 873 &cache_disable_0.attr,
820 &cache_disable_1.attr, 874 &cache_disable_1.attr,
875#endif
821 NULL 876 NULL
822}; 877};
823 878
@@ -908,6 +963,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
908 unsigned int cpu = sys_dev->id; 963 unsigned int cpu = sys_dev->id;
909 unsigned long i, j; 964 unsigned long i, j;
910 struct _index_kobject *this_object; 965 struct _index_kobject *this_object;
966 struct _cpuid4_info *this_leaf;
911 int retval; 967 int retval;
912 968
913 retval = cpuid4_cache_sysfs_init(cpu); 969 retval = cpuid4_cache_sysfs_init(cpu);
@@ -926,6 +982,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
926 this_object = INDEX_KOBJECT_PTR(cpu, i); 982 this_object = INDEX_KOBJECT_PTR(cpu, i);
927 this_object->cpu = cpu; 983 this_object->cpu = cpu;
928 this_object->index = i; 984 this_object->index = i;
985
986 this_leaf = CPUID4_INFO_IDX(cpu, i);
987
988 if (this_leaf->can_disable)
989 ktype_cache.default_attrs = default_l3_attrs;
990 else
991 ktype_cache.default_attrs = default_attrs;
992
929 retval = kobject_init_and_add(&(this_object->kobj), 993 retval = kobject_init_and_add(&(this_object->kobj),
930 &ktype_cache, 994 &ktype_cache,
931 per_cpu(ici_cache_kobject, cpu), 995 per_cpu(ici_cache_kobject, cpu),
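
The intel_cacheinfo.c changes above expose cache_disable_0/cache_disable_1 only on L3 leaves that have can_disable set, and store_cache_disable() follows a fixed sequence: validate the value against the subcache mask and l3_indices, latch it with bit 30, flush the owning node's L3 with wbinvd_on_cpu(), then commit the disable by also setting bit 31. The stand-alone C sketch below models only that sequence; the SUBCACHE_* mask values and the stubbed PCI/flush helpers are illustrative assumptions, not the kernel's definitions.

    #include <stdint.h>
    #include <errno.h>

    /* Stand-ins for the kernel helpers used above; names and signatures are assumed. */
    static void pci_cfg_write32(unsigned int reg, uint32_t val) { (void)reg; (void)val; }
    static void flush_l3_on_cpu(int cpu) { (void)cpu; }   /* models wbinvd_on_cpu() */

    #define SUBCACHE_MASK   0xf0000000u   /* assumed layout of the disable bits */
    #define SUBCACHE_INDEX  0x00000fffu   /* assumed layout of the index bits   */

    /* Mirrors the store path in the hunk: 0 on success, -EINVAL on bad input. */
    int l3_disable_index(int cpu, unsigned int slot, uint32_t val,
                         uint32_t l3_indices)
    {
            /* do not allow writes outside of allowed bits */
            if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
                (val & SUBCACHE_INDEX) > l3_indices)
                    return -EINVAL;

            val |= 1u << 30;                        /* latch the requested index   */
            pci_cfg_write32(0x1BC + slot * 4, val);
            flush_l3_on_cpu(cpu);                   /* flush L3 on the owning node */
            pci_cfg_write32(0x1BC + slot * 4, val | (1u << 31));    /* commit      */
            return 0;
    }

From user space the attributes appear under the per-CPU cache directories (e.g. /sys/devices/system/cpu/cpu0/cache/indexN/cache_disable_0 for the L3 leaf); writing an index there goes through the store path shown above.
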
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b56f8e9..ad9e5ed81181 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o cleanup.o 1obj-y := main.o if.o generic.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 33af14110dfd..92ba9cd31c9a 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
108 return 0; 108 return 0;
109} 109}
110 110
111static struct mtrr_ops amd_mtrr_ops = { 111static const struct mtrr_ops amd_mtrr_ops = {
112 .vendor = X86_VENDOR_AMD, 112 .vendor = X86_VENDOR_AMD,
113 .set = amd_set_mtrr, 113 .set = amd_set_mtrr,
114 .get = amd_get_mtrr, 114 .get = amd_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index de89f14eff3a..316fe3e60a97 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
110 return 0; 110 return 0;
111} 111}
112 112
113static struct mtrr_ops centaur_mtrr_ops = { 113static const struct mtrr_ops centaur_mtrr_ops = {
114 .vendor = X86_VENDOR_CENTAUR, 114 .vendor = X86_VENDOR_CENTAUR,
115 .set = centaur_set_mcr, 115 .set = centaur_set_mcr,
116 .get = centaur_get_mcr, 116 .get = centaur_get_mcr,
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 09b1698e0466..06130b52f012 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
22#include <linux/pci.h> 22#include <linux/pci.h>
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/sort.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
28#include <linux/kvm_para.h> 27#include <linux/kvm_para.h>
28#include <linux/range.h>
29 29
30#include <asm/processor.h> 30#include <asm/processor.h>
31#include <asm/e820.h> 31#include <asm/e820.h>
@@ -34,11 +34,6 @@
34 34
35#include "mtrr.h" 35#include "mtrr.h"
36 36
37struct res_range {
38 unsigned long start;
39 unsigned long end;
40};
41
42struct var_mtrr_range_state { 37struct var_mtrr_range_state {
43 unsigned long base_pfn; 38 unsigned long base_pfn;
44 unsigned long size_pfn; 39 unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
56/* Should be related to MTRR_VAR_RANGES nums */ 51/* Should be related to MTRR_VAR_RANGES nums */
57#define RANGE_NUM 256 52#define RANGE_NUM 256
58 53
59static struct res_range __initdata range[RANGE_NUM]; 54static struct range __initdata range[RANGE_NUM];
60static int __initdata nr_range; 55static int __initdata nr_range;
61 56
62static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 57static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
64static int __initdata debug_print; 59static int __initdata debug_print;
65#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) 60#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
66 61
67
68static int __init
69add_range(struct res_range *range, int nr_range,
70 unsigned long start, unsigned long end)
71{
72 /* Out of slots: */
73 if (nr_range >= RANGE_NUM)
74 return nr_range;
75
76 range[nr_range].start = start;
77 range[nr_range].end = end;
78
79 nr_range++;
80
81 return nr_range;
82}
83
84static int __init
85add_range_with_merge(struct res_range *range, int nr_range,
86 unsigned long start, unsigned long end)
87{
88 int i;
89
90 /* Try to merge it with old one: */
91 for (i = 0; i < nr_range; i++) {
92 unsigned long final_start, final_end;
93 unsigned long common_start, common_end;
94
95 if (!range[i].end)
96 continue;
97
98 common_start = max(range[i].start, start);
99 common_end = min(range[i].end, end);
100 if (common_start > common_end + 1)
101 continue;
102
103 final_start = min(range[i].start, start);
104 final_end = max(range[i].end, end);
105
106 range[i].start = final_start;
107 range[i].end = final_end;
108 return nr_range;
109 }
110
111 /* Need to add it: */
112 return add_range(range, nr_range, start, end);
113}
114
115static void __init
116subtract_range(struct res_range *range, unsigned long start, unsigned long end)
117{
118 int i, j;
119
120 for (j = 0; j < RANGE_NUM; j++) {
121 if (!range[j].end)
122 continue;
123
124 if (start <= range[j].start && end >= range[j].end) {
125 range[j].start = 0;
126 range[j].end = 0;
127 continue;
128 }
129
130 if (start <= range[j].start && end < range[j].end &&
131 range[j].start < end + 1) {
132 range[j].start = end + 1;
133 continue;
134 }
135
136
137 if (start > range[j].start && end >= range[j].end &&
138 range[j].end > start - 1) {
139 range[j].end = start - 1;
140 continue;
141 }
142
143 if (start > range[j].start && end < range[j].end) {
144 /* Find the new spare: */
145 for (i = 0; i < RANGE_NUM; i++) {
146 if (range[i].end == 0)
147 break;
148 }
149 if (i < RANGE_NUM) {
150 range[i].end = range[j].end;
151 range[i].start = end + 1;
152 } else {
153 printk(KERN_ERR "run of slot in ranges\n");
154 }
155 range[j].end = start - 1;
156 continue;
157 }
158 }
159}
160
161static int __init cmp_range(const void *x1, const void *x2)
162{
163 const struct res_range *r1 = x1;
164 const struct res_range *r2 = x2;
165 long start1, start2;
166
167 start1 = r1->start;
168 start2 = r2->start;
169
170 return start1 - start2;
171}
172
173static int __init clean_sort_range(struct res_range *range, int az)
174{
175 int i, j, k = az - 1, nr_range = 0;
176
177 for (i = 0; i < k; i++) {
178 if (range[i].end)
179 continue;
180 for (j = k; j > i; j--) {
181 if (range[j].end) {
182 k = j;
183 break;
184 }
185 }
186 if (j == i)
187 break;
188 range[i].start = range[k].start;
189 range[i].end = range[k].end;
190 range[k].start = 0;
191 range[k].end = 0;
192 k--;
193 }
194 /* count it */
195 for (i = 0; i < az; i++) {
196 if (!range[i].end) {
197 nr_range = i;
198 break;
199 }
200 }
201
202 /* sort them */
203 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
204
205 return nr_range;
206}
207
208#define BIOS_BUG_MSG KERN_WARNING \ 62#define BIOS_BUG_MSG KERN_WARNING \
209 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 63 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
210 64
211static int __init 65static int __init
212x86_get_mtrr_mem_range(struct res_range *range, int nr_range, 66x86_get_mtrr_mem_range(struct range *range, int nr_range,
213 unsigned long extra_remove_base, 67 unsigned long extra_remove_base,
214 unsigned long extra_remove_size) 68 unsigned long extra_remove_size)
215{ 69{
@@ -223,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
223 continue; 77 continue;
224 base = range_state[i].base_pfn; 78 base = range_state[i].base_pfn;
225 size = range_state[i].size_pfn; 79 size = range_state[i].size_pfn;
226 nr_range = add_range_with_merge(range, nr_range, base, 80 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
227 base + size - 1); 81 base, base + size);
228 } 82 }
229 if (debug_print) { 83 if (debug_print) {
230 printk(KERN_DEBUG "After WB checking\n"); 84 printk(KERN_DEBUG "After WB checking\n");
231 for (i = 0; i < nr_range; i++) 85 for (i = 0; i < nr_range; i++)
232 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 86 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
233 range[i].start, range[i].end + 1); 87 range[i].start, range[i].end);
234 } 88 }
235 89
236 /* Take out UC ranges: */ 90 /* Take out UC ranges: */
@@ -252,19 +106,19 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
252 size -= (1<<(20-PAGE_SHIFT)) - base; 106 size -= (1<<(20-PAGE_SHIFT)) - base;
253 base = 1<<(20-PAGE_SHIFT); 107 base = 1<<(20-PAGE_SHIFT);
254 } 108 }
255 subtract_range(range, base, base + size - 1); 109 subtract_range(range, RANGE_NUM, base, base + size);
256 } 110 }
257 if (extra_remove_size) 111 if (extra_remove_size)
258 subtract_range(range, extra_remove_base, 112 subtract_range(range, RANGE_NUM, extra_remove_base,
259 extra_remove_base + extra_remove_size - 1); 113 extra_remove_base + extra_remove_size);
260 114
261 if (debug_print) { 115 if (debug_print) {
262 printk(KERN_DEBUG "After UC checking\n"); 116 printk(KERN_DEBUG "After UC checking\n");
263 for (i = 0; i < RANGE_NUM; i++) { 117 for (i = 0; i < RANGE_NUM; i++) {
264 if (!range[i].end) 118 if (!range[i].end)
265 continue; 119 continue;
266 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 120 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
267 range[i].start, range[i].end + 1); 121 range[i].start, range[i].end);
268 } 122 }
269 } 123 }
270 124
@@ -273,26 +127,22 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
273 if (debug_print) { 127 if (debug_print) {
274 printk(KERN_DEBUG "After sorting\n"); 128 printk(KERN_DEBUG "After sorting\n");
275 for (i = 0; i < nr_range; i++) 129 for (i = 0; i < nr_range; i++)
276 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 130 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
277 range[i].start, range[i].end + 1); 131 range[i].start, range[i].end);
278 } 132 }
279 133
280 /* clear those is not used */
281 for (i = nr_range; i < RANGE_NUM; i++)
282 memset(&range[i], 0, sizeof(range[i]));
283
284 return nr_range; 134 return nr_range;
285} 135}
286 136
287#ifdef CONFIG_MTRR_SANITIZER 137#ifdef CONFIG_MTRR_SANITIZER
288 138
289static unsigned long __init sum_ranges(struct res_range *range, int nr_range) 139static unsigned long __init sum_ranges(struct range *range, int nr_range)
290{ 140{
291 unsigned long sum = 0; 141 unsigned long sum = 0;
292 int i; 142 int i;
293 143
294 for (i = 0; i < nr_range; i++) 144 for (i = 0; i < nr_range; i++)
295 sum += range[i].end + 1 - range[i].start; 145 sum += range[i].end - range[i].start;
296 146
297 return sum; 147 return sum;
298} 148}
@@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
621early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); 471early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
622 472
623static int __init 473static int __init
624x86_setup_var_mtrrs(struct res_range *range, int nr_range, 474x86_setup_var_mtrrs(struct range *range, int nr_range,
625 u64 chunk_size, u64 gran_size) 475 u64 chunk_size, u64 gran_size)
626{ 476{
627 struct var_mtrr_state var_state; 477 struct var_mtrr_state var_state;
@@ -639,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
639 /* Write the range: */ 489 /* Write the range: */
640 for (i = 0; i < nr_range; i++) { 490 for (i = 0; i < nr_range; i++) {
641 set_var_mtrr_range(&var_state, range[i].start, 491 set_var_mtrr_range(&var_state, range[i].start,
642 range[i].end - range[i].start + 1); 492 range[i].end - range[i].start);
643 } 493 }
644 494
645 /* Write the last range: */ 495 /* Write the last range: */
@@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
742 unsigned long x_remove_base, 592 unsigned long x_remove_base,
743 unsigned long x_remove_size, int i) 593 unsigned long x_remove_size, int i)
744{ 594{
745 static struct res_range range_new[RANGE_NUM]; 595 static struct range range_new[RANGE_NUM];
746 unsigned long range_sums_new; 596 unsigned long range_sums_new;
747 static int nr_range_new; 597 static int nr_range_new;
748 int num_reg; 598 int num_reg;
@@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits)
869 * [0, 1M) should always be covered by var mtrr with WB 719 * [0, 1M) should always be covered by var mtrr with WB
870 * and fixed mtrrs should take effect before var mtrr for it: 720 * and fixed mtrrs should take effect before var mtrr for it:
871 */ 721 */
872 nr_range = add_range_with_merge(range, nr_range, 0, 722 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
873 (1ULL<<(20 - PAGE_SHIFT)) - 1); 723 1ULL<<(20 - PAGE_SHIFT));
874 /* Sort the ranges: */ 724 /* Sort the ranges: */
875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 725 sort_range(range, nr_range);
876 726
877 range_sums = sum_ranges(range, nr_range); 727 range_sums = sum_ranges(range, nr_range);
878 printk(KERN_INFO "total RAM covered: %ldM\n", 728 printk(KERN_INFO "total RAM covered: %ldM\n",
@@ -1089,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1089 nr_range = 0; 939 nr_range = 0;
1090 if (mtrr_tom2) { 940 if (mtrr_tom2) {
1091 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); 941 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1092 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; 942 range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
1093 if (highest_pfn < range[nr_range].end + 1) 943 if (highest_pfn < range[nr_range].end)
1094 highest_pfn = range[nr_range].end + 1; 944 highest_pfn = range[nr_range].end;
1095 nr_range++; 945 nr_range++;
1096 } 946 }
1097 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); 947 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -1103,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1103 953
1104 /* Check the holes: */ 954 /* Check the holes: */
1105 for (i = 0; i < nr_range - 1; i++) { 955 for (i = 0; i < nr_range - 1; i++) {
1106 if (range[i].end + 1 < range[i+1].start) 956 if (range[i].end < range[i+1].start)
1107 total_trim_size += real_trim_memory(range[i].end + 1, 957 total_trim_size += real_trim_memory(range[i].end,
1108 range[i+1].start); 958 range[i+1].start);
1109 } 959 }
1110 960
1111 /* Check the top: */ 961 /* Check the top: */
1112 i = nr_range - 1; 962 i = nr_range - 1;
1113 if (range[i].end + 1 < end_pfn) 963 if (range[i].end < end_pfn)
1114 total_trim_size += real_trim_memory(range[i].end + 1, 964 total_trim_size += real_trim_memory(range[i].end,
1115 end_pfn); 965 end_pfn);
1116 966
1117 if (total_trim_size) { 967 if (total_trim_size) {
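
The cleanup.c conversion above drops the private res_range type (inclusive end) in favour of the generic struct range from <linux/range.h> and its add_range_with_merge()/subtract_range()/sort_range() helpers, whose end is exclusive; that is why the "+ 1"/"- 1" adjustments disappear from sum_ranges(), the hole checks and the trim paths. A small user-space sketch of the exclusive-end convention (the values are made up for illustration):

    #include <stdio.h>

    /* Modeled on the generic struct range: end is the first pfn NOT included. */
    struct range {
            unsigned long start;
            unsigned long end;
    };

    static unsigned long sum_ranges(const struct range *r, int n)
    {
            unsigned long sum = 0;
            int i;

            for (i = 0; i < n; i++)
                    sum += r[i].end - r[i].start;   /* no "+ 1" with an exclusive end */
            return sum;
    }

    int main(void)
    {
            /* [0, 1M) plus [2M, 4M) in 4K pages: 256 + 512 = 768 pfns. */
            struct range r[] = { { 0, 256 }, { 512, 1024 } };

            printf("%lu pages covered\n", sum_ranges(r, 2));
            /* adjacent iff r[i].end == r[i+1].start; a hole iff r[i].end < r[i+1].start */
            return 0;
    }
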
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 228d982ce09c..68a3343e5798 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -265,7 +265,7 @@ static void cyrix_set_all(void)
265 post_set(); 265 post_set();
266} 266}
267 267
268static struct mtrr_ops cyrix_mtrr_ops = { 268static const struct mtrr_ops cyrix_mtrr_ops = {
269 .vendor = X86_VENDOR_CYRIX, 269 .vendor = X86_VENDOR_CYRIX,
270 .set_all = cyrix_set_all, 270 .set_all = cyrix_set_all,
271 .set = cyrix_set_arr, 271 .set = cyrix_set_arr,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 55da0c5f68dd..9aa5dc76ff4a 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -464,7 +464,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
464 tmp |= ~((1<<(hi - 1)) - 1); 464 tmp |= ~((1<<(hi - 1)) - 1);
465 465
466 if (tmp != mask_lo) { 466 if (tmp != mask_lo) {
467 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); 467 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
468 mask_lo = tmp; 468 mask_lo = tmp;
469 } 469 }
470 } 470 }
@@ -570,7 +570,7 @@ static unsigned long set_mtrr_state(void)
570 570
571 571
572static unsigned long cr4; 572static unsigned long cr4;
573static DEFINE_SPINLOCK(set_atomicity_lock); 573static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
574 574
575/* 575/*
576 * Since we are disabling the cache don't allow any interrupts, 576 * Since we are disabling the cache don't allow any interrupts,
@@ -590,7 +590,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
590 * changes to the way the kernel boots 590 * changes to the way the kernel boots
591 */ 591 */
592 592
593 spin_lock(&set_atomicity_lock); 593 raw_spin_lock(&set_atomicity_lock);
594 594
595 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ 595 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
596 cr0 = read_cr0() | X86_CR0_CD; 596 cr0 = read_cr0() | X86_CR0_CD;
@@ -627,7 +627,7 @@ static void post_set(void) __releases(set_atomicity_lock)
627 /* Restore value of CR4 */ 627 /* Restore value of CR4 */
628 if (cpu_has_pge) 628 if (cpu_has_pge)
629 write_cr4(cr4); 629 write_cr4(cr4);
630 spin_unlock(&set_atomicity_lock); 630 raw_spin_unlock(&set_atomicity_lock);
631} 631}
632 632
633static void generic_set_all(void) 633static void generic_set_all(void)
@@ -752,7 +752,7 @@ int positive_have_wrcomb(void)
752/* 752/*
753 * Generic structure... 753 * Generic structure...
754 */ 754 */
755struct mtrr_ops generic_mtrr_ops = { 755const struct mtrr_ops generic_mtrr_ops = {
756 .use_intel_if = 1, 756 .use_intel_if = 1,
757 .set_all = generic_set_all, 757 .set_all = generic_set_all,
758 .get = generic_get_mtrr, 758 .get = generic_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 84e83de54575..79556bd9b602 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex);
60u64 size_or_mask, size_and_mask; 60u64 size_or_mask, size_and_mask;
61static bool mtrr_aps_delayed_init; 61static bool mtrr_aps_delayed_init;
62 62
63static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; 63static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
64 64
65struct mtrr_ops *mtrr_if; 65const struct mtrr_ops *mtrr_if;
66 66
67static void set_mtrr(unsigned int reg, unsigned long base, 67static void set_mtrr(unsigned int reg, unsigned long base,
68 unsigned long size, mtrr_type type); 68 unsigned long size, mtrr_type type);
69 69
70void set_mtrr_ops(struct mtrr_ops *ops) 70void set_mtrr_ops(const struct mtrr_ops *ops)
71{ 71{
72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM) 72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
73 mtrr_ops[ops->vendor] = ops; 73 mtrr_ops[ops->vendor] = ops;
@@ -145,6 +145,7 @@ struct set_mtrr_data {
145 145
146/** 146/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data
148 * 149 *
149 * Returns nothing. 150 * Returns nothing.
150 */ 151 */
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index a501dee9a87a..df5e41f31a27 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size,
32extern int generic_validate_add_page(unsigned long base, unsigned long size, 32extern int generic_validate_add_page(unsigned long base, unsigned long size,
33 unsigned int type); 33 unsigned int type);
34 34
35extern struct mtrr_ops generic_mtrr_ops; 35extern const struct mtrr_ops generic_mtrr_ops;
36 36
37extern int positive_have_wrcomb(void); 37extern int positive_have_wrcomb(void);
38 38
@@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index,
53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); 53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
54void get_mtrr_state(void); 54void get_mtrr_state(void);
55 55
56extern void set_mtrr_ops(struct mtrr_ops *ops); 56extern void set_mtrr_ops(const struct mtrr_ops *ops);
57 57
58extern u64 size_or_mask, size_and_mask; 58extern u64 size_or_mask, size_and_mask;
59extern struct mtrr_ops *mtrr_if; 59extern const struct mtrr_ops *mtrr_if;
60 60
61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) 61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) 62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
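
The mtrr changes above constify every mtrr_ops instance together with the mtrr_if and mtrr_ops[] pointers, so the vendor tables can live in read-only data; registration only stores the pointer and callers only read through it. Reduced to a sketch with illustrative names (the real struct mtrr_ops carries many more hooks):

    #include <stddef.h>

    struct ops {
            int vendor;
            void (*set)(unsigned int reg);
    };

    #define VENDOR_NUM 8

    static const struct ops *ops_table[VENDOR_NUM];

    /* Registration stores the pointer; nothing ever writes through it. */
    void register_ops(const struct ops *ops)
    {
            if (ops->vendor > 0 && ops->vendor < VENDOR_NUM)
                    ops_table[ops->vendor] = ops;
    }

    const struct ops *lookup_ops(int vendor)
    {
            return (vendor > 0 && vendor < VENDOR_NUM) ? ops_table[vendor] : NULL;
    }
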
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
deleted file mode 100644
index dfc80b4e6b0d..000000000000
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ /dev/null
@@ -1,94 +0,0 @@
1#include <linux/init.h>
2#include <linux/io.h>
3#include <linux/mm.h>
4
5#include <asm/processor-cyrix.h>
6#include <asm/processor-flags.h>
7#include <asm/mtrr.h>
8#include <asm/msr.h>
9
10#include "mtrr.h"
11
12/* Put the processor into a state where MTRRs can be safely set */
13void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
14{
15 unsigned int cr0;
16
17 /* Disable interrupts locally */
18 local_irq_save(ctxt->flags);
19
20 if (use_intel() || is_cpu(CYRIX)) {
21
22 /* Save value of CR4 and clear Page Global Enable (bit 7) */
23 if (cpu_has_pge) {
24 ctxt->cr4val = read_cr4();
25 write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
26 }
27
28 /*
29 * Disable and flush caches. Note that wbinvd flushes the TLBs
30 * as a side-effect
31 */
32 cr0 = read_cr0() | X86_CR0_CD;
33 wbinvd();
34 write_cr0(cr0);
35 wbinvd();
36
37 if (use_intel()) {
38 /* Save MTRR state */
39 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
40 } else {
41 /*
42 * Cyrix ARRs -
43 * everything else were excluded at the top
44 */
45 ctxt->ccr3 = getCx86(CX86_CCR3);
46 }
47 }
48}
49
50void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
51{
52 if (use_intel()) {
53 /* Disable MTRRs, and set the default type to uncached */
54 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
55 ctxt->deftype_hi);
56 } else {
57 if (is_cpu(CYRIX)) {
58 /* Cyrix ARRs - everything else were excluded at the top */
59 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
60 }
61 }
62}
63
64/* Restore the processor after a set_mtrr_prepare */
65void set_mtrr_done(struct set_mtrr_context *ctxt)
66{
67 if (use_intel() || is_cpu(CYRIX)) {
68
69 /* Flush caches and TLBs */
70 wbinvd();
71
72 /* Restore MTRRdefType */
73 if (use_intel()) {
74 /* Intel (P6) standard MTRRs */
75 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo,
76 ctxt->deftype_hi);
77 } else {
78 /*
79 * Cyrix ARRs -
80 * everything else was excluded at the top
81 */
82 setCx86(CX86_CCR3, ctxt->ccr3);
83 }
84
85 /* Enable caches */
86 write_cr0(read_cr0() & 0xbfffffff);
87
88 /* Restore value of CR4 */
89 if (cpu_has_pge)
90 write_cr4(ctxt->cr4val);
91 }
92 /* Re-enable interrupts locally (if enabled previously) */
93 local_irq_restore(ctxt->flags);
94}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8c1c07073ccc..b1fbdeecf6c9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> 9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 * Copyright (C) 2009 Google, Inc., Stephane Eranian
10 * 11 *
11 * For licencing details see kernel-base/COPYING 12 * For licencing details see kernel-base/COPYING
12 */ 13 */
@@ -22,6 +23,7 @@
22#include <linux/uaccess.h> 23#include <linux/uaccess.h>
23#include <linux/highmem.h> 24#include <linux/highmem.h>
24#include <linux/cpu.h> 25#include <linux/cpu.h>
26#include <linux/bitops.h>
25 27
26#include <asm/apic.h> 28#include <asm/apic.h>
27#include <asm/stacktrace.h> 29#include <asm/stacktrace.h>
@@ -68,26 +70,59 @@ struct debug_store {
68 u64 pebs_event_reset[MAX_PEBS_EVENTS]; 70 u64 pebs_event_reset[MAX_PEBS_EVENTS];
69}; 71};
70 72
73struct event_constraint {
74 union {
75 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
76 u64 idxmsk64[1];
77 };
78 int code;
79 int cmask;
80 int weight;
81};
82
83struct amd_nb {
84 int nb_id; /* NorthBridge id */
85 int refcnt; /* reference count */
86 struct perf_event *owners[X86_PMC_IDX_MAX];
87 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
88};
89
71struct cpu_hw_events { 90struct cpu_hw_events {
72 struct perf_event *events[X86_PMC_IDX_MAX]; 91 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 92 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
75 unsigned long interrupts; 93 unsigned long interrupts;
76 int enabled; 94 int enabled;
77 struct debug_store *ds; 95 struct debug_store *ds;
78};
79 96
80struct event_constraint { 97 int n_events;
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 98 int n_added;
82 int code; 99 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
100 u64 tags[X86_PMC_IDX_MAX];
101 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
102 struct amd_nb *amd_nb;
83}; 103};
84 104
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } 105#define __EVENT_CONSTRAINT(c, n, m, w) {\
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } 106 { .idxmsk64[0] = (n) }, \
107 .code = (c), \
108 .cmask = (m), \
109 .weight = (w), \
110}
111
112#define EVENT_CONSTRAINT(c, n, m) \
113 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
87 114
88#define for_each_event_constraint(e, c) \ 115#define INTEL_EVENT_CONSTRAINT(c, n) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++) 116 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
90 117
118#define FIXED_EVENT_CONSTRAINT(c, n) \
119 EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK)
120
121#define EVENT_CONSTRAINT_END \
122 EVENT_CONSTRAINT(0, 0, 0)
123
124#define for_each_event_constraint(e, c) \
125 for ((e) = (c); (e)->cmask; (e)++)
91 126
92/* 127/*
93 * struct x86_pmu - generic x86 pmu 128 * struct x86_pmu - generic x86 pmu
@@ -114,8 +149,14 @@ struct x86_pmu {
114 u64 intel_ctrl; 149 u64 intel_ctrl;
115 void (*enable_bts)(u64 config); 150 void (*enable_bts)(u64 config);
116 void (*disable_bts)(void); 151 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc, 152
118 struct hw_perf_event *hwc); 153 struct event_constraint *
154 (*get_event_constraints)(struct cpu_hw_events *cpuc,
155 struct perf_event *event);
156
157 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
158 struct perf_event *event);
159 struct event_constraint *event_constraints;
119}; 160};
120 161
121static struct x86_pmu x86_pmu __read_mostly; 162static struct x86_pmu x86_pmu __read_mostly;
@@ -124,111 +165,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
124 .enabled = 1, 165 .enabled = 1,
125}; 166};
126 167
127static const struct event_constraint *event_constraints; 168static int x86_perf_event_set_period(struct perf_event *event,
128 169 struct hw_perf_event *hwc, int idx);
129/*
130 * Not sure about some of these
131 */
132static const u64 p6_perfmon_event_map[] =
133{
134 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
135 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
136 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
137 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
138 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
139 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
140 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
141};
142
143static u64 p6_pmu_event_map(int hw_event)
144{
145 return p6_perfmon_event_map[hw_event];
146}
147
148/*
149 * Event setting that is specified not to count anything.
150 * We use this to effectively disable a counter.
151 *
152 * L2_RQSTS with 0 MESI unit mask.
153 */
154#define P6_NOP_EVENT 0x0000002EULL
155
156static u64 p6_pmu_raw_event(u64 hw_event)
157{
158#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
159#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
160#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
161#define P6_EVNTSEL_INV_MASK 0x00800000ULL
162#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
163
164#define P6_EVNTSEL_MASK \
165 (P6_EVNTSEL_EVENT_MASK | \
166 P6_EVNTSEL_UNIT_MASK | \
167 P6_EVNTSEL_EDGE_MASK | \
168 P6_EVNTSEL_INV_MASK | \
169 P6_EVNTSEL_REG_MASK)
170
171 return hw_event & P6_EVNTSEL_MASK;
172}
173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
184
185/*
186 * Intel PerfMon v3. Used on Core2 and later.
187 */
188static const u64 intel_perfmon_event_map[] =
189{
190 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
191 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
192 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
193 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
194 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
195 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
197};
198
199static const struct event_constraint intel_core_event_constraints[] =
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212
213static const struct event_constraint intel_nehalem_event_constraints[] =
214{
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226};
227
228static u64 intel_pmu_event_map(int hw_event)
229{
230 return intel_perfmon_event_map[hw_event];
231}
232 170
233/* 171/*
234 * Generalized hw caching related hw_event table, filled 172 * Generalized hw caching related hw_event table, filled
@@ -245,424 +183,6 @@ static u64 __read_mostly hw_cache_event_ids
245 [PERF_COUNT_HW_CACHE_OP_MAX] 183 [PERF_COUNT_HW_CACHE_OP_MAX]
246 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 184 [PERF_COUNT_HW_CACHE_RESULT_MAX];
247 185
248static __initconst u64 nehalem_hw_cache_event_ids
249 [PERF_COUNT_HW_CACHE_MAX]
250 [PERF_COUNT_HW_CACHE_OP_MAX]
251 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
252{
253 [ C(L1D) ] = {
254 [ C(OP_READ) ] = {
255 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
256 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
257 },
258 [ C(OP_WRITE) ] = {
259 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
260 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
261 },
262 [ C(OP_PREFETCH) ] = {
263 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
264 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
265 },
266 },
267 [ C(L1I ) ] = {
268 [ C(OP_READ) ] = {
269 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
270 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
271 },
272 [ C(OP_WRITE) ] = {
273 [ C(RESULT_ACCESS) ] = -1,
274 [ C(RESULT_MISS) ] = -1,
275 },
276 [ C(OP_PREFETCH) ] = {
277 [ C(RESULT_ACCESS) ] = 0x0,
278 [ C(RESULT_MISS) ] = 0x0,
279 },
280 },
281 [ C(LL ) ] = {
282 [ C(OP_READ) ] = {
283 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
284 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
285 },
286 [ C(OP_WRITE) ] = {
287 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
288 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
289 },
290 [ C(OP_PREFETCH) ] = {
291 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
292 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
293 },
294 },
295 [ C(DTLB) ] = {
296 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
298 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
299 },
300 [ C(OP_WRITE) ] = {
301 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
302 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
303 },
304 [ C(OP_PREFETCH) ] = {
305 [ C(RESULT_ACCESS) ] = 0x0,
306 [ C(RESULT_MISS) ] = 0x0,
307 },
308 },
309 [ C(ITLB) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
312 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
313 },
314 [ C(OP_WRITE) ] = {
315 [ C(RESULT_ACCESS) ] = -1,
316 [ C(RESULT_MISS) ] = -1,
317 },
318 [ C(OP_PREFETCH) ] = {
319 [ C(RESULT_ACCESS) ] = -1,
320 [ C(RESULT_MISS) ] = -1,
321 },
322 },
323 [ C(BPU ) ] = {
324 [ C(OP_READ) ] = {
325 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
326 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
327 },
328 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1,
330 [ C(RESULT_MISS) ] = -1,
331 },
332 [ C(OP_PREFETCH) ] = {
333 [ C(RESULT_ACCESS) ] = -1,
334 [ C(RESULT_MISS) ] = -1,
335 },
336 },
337};
338
339static __initconst u64 core2_hw_cache_event_ids
340 [PERF_COUNT_HW_CACHE_MAX]
341 [PERF_COUNT_HW_CACHE_OP_MAX]
342 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
343{
344 [ C(L1D) ] = {
345 [ C(OP_READ) ] = {
346 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
347 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
348 },
349 [ C(OP_WRITE) ] = {
350 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
351 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
352 },
353 [ C(OP_PREFETCH) ] = {
354 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
355 [ C(RESULT_MISS) ] = 0,
356 },
357 },
358 [ C(L1I ) ] = {
359 [ C(OP_READ) ] = {
360 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
361 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
362 },
363 [ C(OP_WRITE) ] = {
364 [ C(RESULT_ACCESS) ] = -1,
365 [ C(RESULT_MISS) ] = -1,
366 },
367 [ C(OP_PREFETCH) ] = {
368 [ C(RESULT_ACCESS) ] = 0,
369 [ C(RESULT_MISS) ] = 0,
370 },
371 },
372 [ C(LL ) ] = {
373 [ C(OP_READ) ] = {
374 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
375 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
376 },
377 [ C(OP_WRITE) ] = {
378 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
379 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
380 },
381 [ C(OP_PREFETCH) ] = {
382 [ C(RESULT_ACCESS) ] = 0,
383 [ C(RESULT_MISS) ] = 0,
384 },
385 },
386 [ C(DTLB) ] = {
387 [ C(OP_READ) ] = {
388 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
389 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
390 },
391 [ C(OP_WRITE) ] = {
392 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
393 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
394 },
395 [ C(OP_PREFETCH) ] = {
396 [ C(RESULT_ACCESS) ] = 0,
397 [ C(RESULT_MISS) ] = 0,
398 },
399 },
400 [ C(ITLB) ] = {
401 [ C(OP_READ) ] = {
402 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
403 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
404 },
405 [ C(OP_WRITE) ] = {
406 [ C(RESULT_ACCESS) ] = -1,
407 [ C(RESULT_MISS) ] = -1,
408 },
409 [ C(OP_PREFETCH) ] = {
410 [ C(RESULT_ACCESS) ] = -1,
411 [ C(RESULT_MISS) ] = -1,
412 },
413 },
414 [ C(BPU ) ] = {
415 [ C(OP_READ) ] = {
416 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
417 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
418 },
419 [ C(OP_WRITE) ] = {
420 [ C(RESULT_ACCESS) ] = -1,
421 [ C(RESULT_MISS) ] = -1,
422 },
423 [ C(OP_PREFETCH) ] = {
424 [ C(RESULT_ACCESS) ] = -1,
425 [ C(RESULT_MISS) ] = -1,
426 },
427 },
428};
429
430static __initconst u64 atom_hw_cache_event_ids
431 [PERF_COUNT_HW_CACHE_MAX]
432 [PERF_COUNT_HW_CACHE_OP_MAX]
433 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
434{
435 [ C(L1D) ] = {
436 [ C(OP_READ) ] = {
437 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
438 [ C(RESULT_MISS) ] = 0,
439 },
440 [ C(OP_WRITE) ] = {
441 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
442 [ C(RESULT_MISS) ] = 0,
443 },
444 [ C(OP_PREFETCH) ] = {
445 [ C(RESULT_ACCESS) ] = 0x0,
446 [ C(RESULT_MISS) ] = 0,
447 },
448 },
449 [ C(L1I ) ] = {
450 [ C(OP_READ) ] = {
451 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
452 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
453 },
454 [ C(OP_WRITE) ] = {
455 [ C(RESULT_ACCESS) ] = -1,
456 [ C(RESULT_MISS) ] = -1,
457 },
458 [ C(OP_PREFETCH) ] = {
459 [ C(RESULT_ACCESS) ] = 0,
460 [ C(RESULT_MISS) ] = 0,
461 },
462 },
463 [ C(LL ) ] = {
464 [ C(OP_READ) ] = {
465 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
466 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
467 },
468 [ C(OP_WRITE) ] = {
469 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
470 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
471 },
472 [ C(OP_PREFETCH) ] = {
473 [ C(RESULT_ACCESS) ] = 0,
474 [ C(RESULT_MISS) ] = 0,
475 },
476 },
477 [ C(DTLB) ] = {
478 [ C(OP_READ) ] = {
479 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
480 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
481 },
482 [ C(OP_WRITE) ] = {
483 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
484 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
485 },
486 [ C(OP_PREFETCH) ] = {
487 [ C(RESULT_ACCESS) ] = 0,
488 [ C(RESULT_MISS) ] = 0,
489 },
490 },
491 [ C(ITLB) ] = {
492 [ C(OP_READ) ] = {
493 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
494 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
495 },
496 [ C(OP_WRITE) ] = {
497 [ C(RESULT_ACCESS) ] = -1,
498 [ C(RESULT_MISS) ] = -1,
499 },
500 [ C(OP_PREFETCH) ] = {
501 [ C(RESULT_ACCESS) ] = -1,
502 [ C(RESULT_MISS) ] = -1,
503 },
504 },
505 [ C(BPU ) ] = {
506 [ C(OP_READ) ] = {
507 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
508 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
509 },
510 [ C(OP_WRITE) ] = {
511 [ C(RESULT_ACCESS) ] = -1,
512 [ C(RESULT_MISS) ] = -1,
513 },
514 [ C(OP_PREFETCH) ] = {
515 [ C(RESULT_ACCESS) ] = -1,
516 [ C(RESULT_MISS) ] = -1,
517 },
518 },
519};
520
521static u64 intel_pmu_raw_event(u64 hw_event)
522{
523#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
528
529#define CORE_EVNTSEL_MASK \
530 (CORE_EVNTSEL_EVENT_MASK | \
531 CORE_EVNTSEL_UNIT_MASK | \
532 CORE_EVNTSEL_EDGE_MASK | \
533 CORE_EVNTSEL_INV_MASK | \
534 CORE_EVNTSEL_REG_MASK)
535
536 return hw_event & CORE_EVNTSEL_MASK;
537}
538
539static __initconst u64 amd_hw_cache_event_ids
540 [PERF_COUNT_HW_CACHE_MAX]
541 [PERF_COUNT_HW_CACHE_OP_MAX]
542 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
543{
544 [ C(L1D) ] = {
545 [ C(OP_READ) ] = {
546 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
547 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
548 },
549 [ C(OP_WRITE) ] = {
550 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
551 [ C(RESULT_MISS) ] = 0,
552 },
553 [ C(OP_PREFETCH) ] = {
554 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
555 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
556 },
557 },
558 [ C(L1I ) ] = {
559 [ C(OP_READ) ] = {
560 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
561 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
562 },
563 [ C(OP_WRITE) ] = {
564 [ C(RESULT_ACCESS) ] = -1,
565 [ C(RESULT_MISS) ] = -1,
566 },
567 [ C(OP_PREFETCH) ] = {
568 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
569 [ C(RESULT_MISS) ] = 0,
570 },
571 },
572 [ C(LL ) ] = {
573 [ C(OP_READ) ] = {
574 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
575 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
576 },
577 [ C(OP_WRITE) ] = {
578 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
579 [ C(RESULT_MISS) ] = 0,
580 },
581 [ C(OP_PREFETCH) ] = {
582 [ C(RESULT_ACCESS) ] = 0,
583 [ C(RESULT_MISS) ] = 0,
584 },
585 },
586 [ C(DTLB) ] = {
587 [ C(OP_READ) ] = {
588 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
589 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
590 },
591 [ C(OP_WRITE) ] = {
592 [ C(RESULT_ACCESS) ] = 0,
593 [ C(RESULT_MISS) ] = 0,
594 },
595 [ C(OP_PREFETCH) ] = {
596 [ C(RESULT_ACCESS) ] = 0,
597 [ C(RESULT_MISS) ] = 0,
598 },
599 },
600 [ C(ITLB) ] = {
601 [ C(OP_READ) ] = {
602 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
603 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
604 },
605 [ C(OP_WRITE) ] = {
606 [ C(RESULT_ACCESS) ] = -1,
607 [ C(RESULT_MISS) ] = -1,
608 },
609 [ C(OP_PREFETCH) ] = {
610 [ C(RESULT_ACCESS) ] = -1,
611 [ C(RESULT_MISS) ] = -1,
612 },
613 },
614 [ C(BPU ) ] = {
615 [ C(OP_READ) ] = {
616 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
617 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
618 },
619 [ C(OP_WRITE) ] = {
620 [ C(RESULT_ACCESS) ] = -1,
621 [ C(RESULT_MISS) ] = -1,
622 },
623 [ C(OP_PREFETCH) ] = {
624 [ C(RESULT_ACCESS) ] = -1,
625 [ C(RESULT_MISS) ] = -1,
626 },
627 },
628};
629
630/*
631 * AMD Performance Monitor K7 and later.
632 */
633static const u64 amd_perfmon_event_map[] =
634{
635 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
636 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
637 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
638 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
639 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
640 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
641};
642
643static u64 amd_pmu_event_map(int hw_event)
644{
645 return amd_perfmon_event_map[hw_event];
646}
647
648static u64 amd_pmu_raw_event(u64 hw_event)
649{
650#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
651#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
652#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
653#define K7_EVNTSEL_INV_MASK 0x000800000ULL
654#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
655
656#define K7_EVNTSEL_MASK \
657 (K7_EVNTSEL_EVENT_MASK | \
658 K7_EVNTSEL_UNIT_MASK | \
659 K7_EVNTSEL_EDGE_MASK | \
660 K7_EVNTSEL_INV_MASK | \
661 K7_EVNTSEL_REG_MASK)
662
663 return hw_event & K7_EVNTSEL_MASK;
664}
665
666/* 186/*
667 * Propagate event elapsed time into the generic event. 187 * Propagate event elapsed time into the generic event.
668 * Can only be executed on the CPU where the event is active. 188 * Can only be executed on the CPU where the event is active.
@@ -914,42 +434,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
914 return 0; 434 return 0;
915} 435}
916 436
917static void intel_pmu_enable_bts(u64 config)
918{
919 unsigned long debugctlmsr;
920
921 debugctlmsr = get_debugctlmsr();
922
923 debugctlmsr |= X86_DEBUGCTL_TR;
924 debugctlmsr |= X86_DEBUGCTL_BTS;
925 debugctlmsr |= X86_DEBUGCTL_BTINT;
926
927 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
928 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
929
930 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
931 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
932
933 update_debugctlmsr(debugctlmsr);
934}
935
936static void intel_pmu_disable_bts(void)
937{
938 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
939 unsigned long debugctlmsr;
940
941 if (!cpuc->ds)
942 return;
943
944 debugctlmsr = get_debugctlmsr();
945
946 debugctlmsr &=
947 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
948 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
949
950 update_debugctlmsr(debugctlmsr);
951}
952
953/* 437/*
954 * Setup the hardware configuration for a given attr_type 438 * Setup the hardware configuration for a given attr_type
955 */ 439 */
@@ -988,6 +472,8 @@ static int __hw_perf_event_init(struct perf_event *event)
988 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 472 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
989 473
990 hwc->idx = -1; 474 hwc->idx = -1;
475 hwc->last_cpu = -1;
476 hwc->last_tag = ~0ULL;
991 477
992 /* 478 /*
993 * Count user and OS events unless requested not to. 479 * Count user and OS events unless requested not to.
@@ -1056,216 +542,323 @@ static int __hw_perf_event_init(struct perf_event *event)
1056 return 0; 542 return 0;
1057} 543}
1058 544
1059static void p6_pmu_disable_all(void) 545static void x86_pmu_disable_all(void)
1060{ 546{
1061 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 547 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1062 u64 val; 548 int idx;
1063
1064 if (!cpuc->enabled)
1065 return;
1066 549
1067 cpuc->enabled = 0; 550 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1068 barrier(); 551 u64 val;
1069 552
1070 /* p6 only has one enable register */ 553 if (!test_bit(idx, cpuc->active_mask))
1071 rdmsrl(MSR_P6_EVNTSEL0, val); 554 continue;
1072 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 555 rdmsrl(x86_pmu.eventsel + idx, val);
1073 wrmsrl(MSR_P6_EVNTSEL0, val); 556 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
557 continue;
558 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
559 wrmsrl(x86_pmu.eventsel + idx, val);
560 }
1074} 561}
1075 562
1076static void intel_pmu_disable_all(void) 563void hw_perf_disable(void)
1077{ 564{
1078 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1079 566
567 if (!x86_pmu_initialized())
568 return;
569
1080 if (!cpuc->enabled) 570 if (!cpuc->enabled)
1081 return; 571 return;
1082 572
573 cpuc->n_added = 0;
1083 cpuc->enabled = 0; 574 cpuc->enabled = 0;
1084 barrier(); 575 barrier();
1085 576
1086 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 577 x86_pmu.disable_all();
1087
1088 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
1089 intel_pmu_disable_bts();
1090} 578}
1091 579
1092static void amd_pmu_disable_all(void) 580static void x86_pmu_enable_all(void)
1093{ 581{
1094 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 582 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1095 int idx; 583 int idx;
1096 584
1097 if (!cpuc->enabled)
1098 return;
1099
1100 cpuc->enabled = 0;
1101 /*
1102 * ensure we write the disable before we start disabling the
1103 * events proper, so that amd_pmu_enable_event() does the
1104 * right thing.
1105 */
1106 barrier();
1107
1108 for (idx = 0; idx < x86_pmu.num_events; idx++) { 585 for (idx = 0; idx < x86_pmu.num_events; idx++) {
586 struct perf_event *event = cpuc->events[idx];
1109 u64 val; 587 u64 val;
1110 588
1111 if (!test_bit(idx, cpuc->active_mask)) 589 if (!test_bit(idx, cpuc->active_mask))
1112 continue; 590 continue;
1113 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 591
1114 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) 592 val = event->hw.config;
1115 continue; 593 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1116 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 594 wrmsrl(x86_pmu.eventsel + idx, val);
1117 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1118 } 595 }
1119} 596}
1120 597
1121void hw_perf_disable(void) 598static const struct pmu pmu;
599
600static inline int is_x86_event(struct perf_event *event)
1122{ 601{
1123 if (!x86_pmu_initialized()) 602 return event->pmu == &pmu;
1124 return;
1125 return x86_pmu.disable_all();
1126} 603}
1127 604
1128static void p6_pmu_enable_all(void) 605static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
1129{ 606{
1130 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 607 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
1131 unsigned long val; 608 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
609 int i, j, w, wmax, num = 0;
610 struct hw_perf_event *hwc;
1132 611
1133 if (cpuc->enabled) 612 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1134 return;
1135 613
1136 cpuc->enabled = 1; 614 for (i = 0; i < n; i++) {
1137 barrier(); 615 constraints[i] =
616 x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
617 }
1138 618
1139 /* p6 only has one enable register */ 619 /*
1140 rdmsrl(MSR_P6_EVNTSEL0, val); 620 * fastpath, try to reuse previous register
1141 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 621 */
1142 wrmsrl(MSR_P6_EVNTSEL0, val); 622 for (i = 0; i < n; i++) {
1143} 623 hwc = &cpuc->event_list[i]->hw;
624 c = constraints[i];
1144 625
1145static void intel_pmu_enable_all(void) 626 /* never assigned */
1146{ 627 if (hwc->idx == -1)
1147 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 628 break;
1148 629
1149 if (cpuc->enabled) 630 /* constraint still honored */
1150 return; 631 if (!test_bit(hwc->idx, c->idxmsk))
632 break;
1151 633
1152 cpuc->enabled = 1; 634 /* not already used */
1153 barrier(); 635 if (test_bit(hwc->idx, used_mask))
636 break;
637
638 set_bit(hwc->idx, used_mask);
639 if (assign)
640 assign[i] = hwc->idx;
641 }
642 if (i == n)
643 goto done;
644
645 /*
646 * begin slow path
647 */
648
649 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
650
651 /*
652 * weight = number of possible counters
653 *
654 * 1 = most constrained, only works on one counter
655 * wmax = least constrained, works on any counter
656 *
657 * assign events to counters starting with most
658 * constrained events.
659 */
660 wmax = x86_pmu.num_events;
661
662 /*
663 * when fixed event counters are present,
664 * wmax is incremented by 1 to account
665 * for one more choice
666 */
667 if (x86_pmu.num_events_fixed)
668 wmax++;
1154 669
1155 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 670 for (w = 1, num = n; num && w <= wmax; w++) {
671 /* for each event */
672 for (i = 0; num && i < n; i++) {
673 c = constraints[i];
674 hwc = &cpuc->event_list[i]->hw;
1156 675
1157 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 676 if (c->weight != w)
1158 struct perf_event *event = 677 continue;
1159 cpuc->events[X86_PMC_IDX_FIXED_BTS];
1160 678
1161 if (WARN_ON_ONCE(!event)) 679 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
1162 return; 680 if (!test_bit(j, used_mask))
681 break;
682 }
683
684 if (j == X86_PMC_IDX_MAX)
685 break;
686
687 set_bit(j, used_mask);
1163 688
1164 intel_pmu_enable_bts(event->hw.config); 689 if (assign)
690 assign[i] = j;
691 num--;
692 }
693 }
694done:
695 /*
696 * scheduling failed or is just a simulation,
697 * free resources if necessary
698 */
699 if (!assign || num) {
700 for (i = 0; i < n; i++) {
701 if (x86_pmu.put_event_constraints)
702 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
703 }
1165 } 704 }
705 return num ? -ENOSPC : 0;
1166} 706}
1167 707
1168static void amd_pmu_enable_all(void) 708/*
709 * dogrp: true if we must collect sibling events (group)
710 * returns total number of events and error code
711 */
712static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
1169{ 713{
1170 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 714 struct perf_event *event;
1171 int idx; 715 int n, max_count;
1172 716
1173 if (cpuc->enabled) 717 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
1174 return;
1175 718
1176 cpuc->enabled = 1; 719 /* current number of events already accepted */
1177 barrier(); 720 n = cpuc->n_events;
1178 721
1179 for (idx = 0; idx < x86_pmu.num_events; idx++) { 722 if (is_x86_event(leader)) {
1180 struct perf_event *event = cpuc->events[idx]; 723 if (n >= max_count)
1181 u64 val; 724 return -ENOSPC;
725 cpuc->event_list[n] = leader;
726 n++;
727 }
728 if (!dogrp)
729 return n;
1182 730
1183 if (!test_bit(idx, cpuc->active_mask)) 731 list_for_each_entry(event, &leader->sibling_list, group_entry) {
732 if (!is_x86_event(event) ||
733 event->state <= PERF_EVENT_STATE_OFF)
1184 continue; 734 continue;
1185 735
1186 val = event->hw.config; 736 if (n >= max_count)
1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 737 return -ENOSPC;
1188 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1189 }
1190}
1191 738
1192void hw_perf_enable(void) 739 cpuc->event_list[n] = event;
1193{ 740 n++;
1194 if (!x86_pmu_initialized()) 741 }
1195 return; 742 return n;
1196 x86_pmu.enable_all();
1197} 743}
1198 744
1199static inline u64 intel_pmu_get_status(void) 745static inline void x86_assign_hw_event(struct perf_event *event,
746 struct cpu_hw_events *cpuc, int i)
1200{ 747{
1201 u64 status; 748 struct hw_perf_event *hwc = &event->hw;
1202 749
1203 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 750 hwc->idx = cpuc->assign[i];
751 hwc->last_cpu = smp_processor_id();
752 hwc->last_tag = ++cpuc->tags[i];
1204 753
1205 return status; 754 if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
755 hwc->config_base = 0;
756 hwc->event_base = 0;
757 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
758 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
759 /*
760 * We set it so that event_base + idx in wrmsr/rdmsr maps to
761 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
762 */
763 hwc->event_base =
764 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
765 } else {
766 hwc->config_base = x86_pmu.eventsel;
767 hwc->event_base = x86_pmu.perfctr;
768 }
1206} 769}
1207 770
1208static inline void intel_pmu_ack_status(u64 ack) 771static inline int match_prev_assignment(struct hw_perf_event *hwc,
772 struct cpu_hw_events *cpuc,
773 int i)
1209{ 774{
1210 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 775 return hwc->idx == cpuc->assign[i] &&
776 hwc->last_cpu == smp_processor_id() &&
777 hwc->last_tag == cpuc->tags[i];
1211} 778}
1212 779
1213static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) 780static void x86_pmu_stop(struct perf_event *event);
1214{
1215 (void)checking_wrmsrl(hwc->config_base + idx,
1216 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
1217}
1218 781
1219static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) 782void hw_perf_enable(void)
1220{ 783{
1221 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 784 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1222} 785 struct perf_event *event;
786 struct hw_perf_event *hwc;
787 int i;
1223 788
1224static inline void 789 if (!x86_pmu_initialized())
1225intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) 790 return;
1226{
1227 int idx = __idx - X86_PMC_IDX_FIXED;
1228 u64 ctrl_val, mask;
1229 791
1230 mask = 0xfULL << (idx * 4); 792 if (cpuc->enabled)
793 return;
1231 794
1232 rdmsrl(hwc->config_base, ctrl_val); 795 if (cpuc->n_added) {
1233 ctrl_val &= ~mask; 796 /*
1234 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 797 * apply assignment obtained either from
1235} 798 * hw_perf_group_sched_in() or x86_pmu_enable()
799 *
800 * step1: save events moving to new counters
801 * step2: reprogram moved events into new counters
802 */
803 for (i = 0; i < cpuc->n_events; i++) {
1236 804
1237static inline void 805 event = cpuc->event_list[i];
1238p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) 806 hwc = &event->hw;
1239{
1240 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1241 u64 val = P6_NOP_EVENT;
1242 807
1243 if (cpuc->enabled) 808 /*
1244 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 809 * we can avoid reprogramming counter if:
810 * - assigned same counter as last time
811 * - running on same CPU as last time
812 * - no other event has used the counter since
813 */
814 if (hwc->idx == -1 ||
815 match_prev_assignment(hwc, cpuc, i))
816 continue;
1245 817
1246 (void)checking_wrmsrl(hwc->config_base + idx, val); 818 x86_pmu_stop(event);
1247}
1248 819
1249static inline void 820 hwc->idx = -1;
1250intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) 821 }
1251{
1252 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1253 intel_pmu_disable_bts();
1254 return;
1255 }
1256 822
1257 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 823 for (i = 0; i < cpuc->n_events; i++) {
1258 intel_pmu_disable_fixed(hwc, idx); 824
1259 return; 825 event = cpuc->event_list[i];
826 hwc = &event->hw;
827
828 if (hwc->idx == -1) {
829 x86_assign_hw_event(event, cpuc, i);
830 x86_perf_event_set_period(event, hwc, hwc->idx);
831 }
832 /*
833 * need to mark as active because x86_pmu_disable()
                                                               834 * clears active_mask and events[] yet it preserves
835 * idx
836 */
837 set_bit(hwc->idx, cpuc->active_mask);
838 cpuc->events[hwc->idx] = event;
839
840 x86_pmu.enable(hwc, hwc->idx);
841 perf_event_update_userpage(event);
842 }
843 cpuc->n_added = 0;
844 perf_events_lapic_init();
1260 } 845 }
1261 846
1262 x86_pmu_disable_event(hwc, idx); 847 cpuc->enabled = 1;
848 barrier();
849
850 x86_pmu.enable_all();
851}
852
853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
854{
855 (void)checking_wrmsrl(hwc->config_base + idx,
856 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
1263} 857}
1264 858
1265static inline void 859static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1266amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1267{ 860{
1268 x86_pmu_disable_event(hwc, idx); 861 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
1269} 862}
1270 863
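
The hw_perf_enable() path above reprograms a counter only when match_prev_assignment() fails, that is, when the event was moved to a different counter, is now running on a different CPU, or another event used the counter in between (the per-counter tag changed). Below is a minimal standalone model of that decision, not the kernel code; the struct and function names are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

/* hypothetical names, for illustration only */
struct assign_state {
	int idx;		/* counter index */
	int cpu;		/* CPU the event last ran on */
	unsigned long tag;	/* generation tag of that counter */
};

/* reprogram only if counter, CPU or tag changed since last activation */
static bool needs_reprogram(const struct assign_state *last,
			    const struct assign_state *now)
{
	return last->idx != now->idx ||
	       last->cpu != now->cpu ||
	       last->tag != now->tag;
}

int main(void)
{
	struct assign_state last = { .idx = 2, .cpu = 0, .tag = 7 };
	struct assign_state now  = last;

	printf("reprogram: %d\n", needs_reprogram(&last, &now));	/* 0: reuse */

	now.tag++;	/* some other event used counter 2 in between */
	printf("reprogram: %d\n", needs_reprogram(&last, &now));	/* 1 */

	return 0;
}
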
1271static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 864static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1326,220 +919,60 @@ x86_perf_event_set_period(struct perf_event *event,
1326 return ret; 919 return ret;
1327} 920}
1328 921
1329static inline void 922static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1330intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1331{
1332 int idx = __idx - X86_PMC_IDX_FIXED;
1333 u64 ctrl_val, bits, mask;
1334 int err;
1335
1336 /*
1337 * Enable IRQ generation (0x8),
1338 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
1339 * if requested:
1340 */
1341 bits = 0x8ULL;
1342 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
1343 bits |= 0x2;
1344 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
1345 bits |= 0x1;
1346
1347 /*
1348 * ANY bit is supported in v3 and up
1349 */
1350 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
1351 bits |= 0x4;
1352
1353 bits <<= (idx * 4);
1354 mask = 0xfULL << (idx * 4);
1355
1356 rdmsrl(hwc->config_base, ctrl_val);
1357 ctrl_val &= ~mask;
1358 ctrl_val |= bits;
1359 err = checking_wrmsrl(hwc->config_base, ctrl_val);
1360}
1361
1362static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1363{ 923{
1364 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 924 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1365 u64 val;
1366
1367 val = hwc->config;
1368 if (cpuc->enabled) 925 if (cpuc->enabled)
1369 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 926 __x86_pmu_enable_event(hwc, idx);
1370
1371 (void)checking_wrmsrl(hwc->config_base + idx, val);
1372} 927}
1373 928
1374 929/*
1375static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) 930 * activate a single event
1376{ 931 *
1377 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 932 * The event is added to the group of enabled events
 1378 if (!__get_cpu_var(cpu_hw_events).enabled) 933 * but only if it can be scheduled with existing events.
1379 return; 934 *
1380 935 * Called with PMU disabled. If successful and return value 1,
1381 intel_pmu_enable_bts(hwc->config); 936 * then guaranteed to call perf_enable() and hw_perf_enable()
1382 return; 937 */
1383 } 938static int x86_pmu_enable(struct perf_event *event)
1384
1385 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
1386 intel_pmu_enable_fixed(hwc, idx);
1387 return;
1388 }
1389
1390 x86_pmu_enable_event(hwc, idx);
1391}
1392
1393static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1394{ 939{
1395 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 940 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
941 struct hw_perf_event *hwc;
942 int assign[X86_PMC_IDX_MAX];
943 int n, n0, ret;
1396 944
1397 if (cpuc->enabled) 945 hwc = &event->hw;
1398 x86_pmu_enable_event(hwc, idx);
1399}
1400
1401static int fixed_mode_idx(struct hw_perf_event *hwc)
1402{
1403 unsigned int hw_event;
1404
1405 hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1406
1407 if (unlikely((hw_event ==
1408 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
1409 (hwc->sample_period == 1)))
1410 return X86_PMC_IDX_FIXED_BTS;
1411 946
1412 if (!x86_pmu.num_events_fixed) 947 n0 = cpuc->n_events;
1413 return -1; 948 n = collect_events(cpuc, event, false);
949 if (n < 0)
950 return n;
1414 951
952 ret = x86_schedule_events(cpuc, n, assign);
953 if (ret)
954 return ret;
1415 /* 955 /*
1416 * fixed counters do not take all possible filters 956 * copy new assignment, now we know it is possible
957 * will be used by hw_perf_enable()
1417 */ 958 */
1418 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) 959 memcpy(cpuc->assign, assign, n*sizeof(int));
1419 return -1;
1420 960
1421 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 961 cpuc->n_events = n;
1422 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 962 cpuc->n_added = n - n0;
1423 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1424 return X86_PMC_IDX_FIXED_CPU_CYCLES;
1425 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1426 return X86_PMC_IDX_FIXED_BUS_CYCLES;
1427 963
1428 return -1; 964 return 0;
1429}
1430
1431/*
1432 * generic counter allocator: get next free counter
1433 */
1434static int
1435gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1436{
1437 int idx;
1438
1439 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1440 return idx == x86_pmu.num_events ? -1 : idx;
1441}
1442
1443/*
1444 * intel-specific counter allocator: check event constraints
1445 */
1446static int
1447intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1448{
1449 const struct event_constraint *event_constraint;
1450 int i, code;
1451
1452 if (!event_constraints)
1453 goto skip;
1454
1455 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1456
1457 for_each_event_constraint(event_constraint, event_constraints) {
1458 if (code == event_constraint->code) {
1459 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1460 if (!test_and_set_bit(i, cpuc->used_mask))
1461 return i;
1462 }
1463 return -1;
1464 }
1465 }
1466skip:
1467 return gen_get_event_idx(cpuc, hwc);
1468}
1469
1470static int
1471x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1472{
1473 int idx;
1474
1475 idx = fixed_mode_idx(hwc);
1476 if (idx == X86_PMC_IDX_FIXED_BTS) {
1477 /* BTS is already occupied. */
1478 if (test_and_set_bit(idx, cpuc->used_mask))
1479 return -EAGAIN;
1480
1481 hwc->config_base = 0;
1482 hwc->event_base = 0;
1483 hwc->idx = idx;
1484 } else if (idx >= 0) {
1485 /*
1486 * Try to get the fixed event, if that is already taken
1487 * then try to get a generic event:
1488 */
1489 if (test_and_set_bit(idx, cpuc->used_mask))
1490 goto try_generic;
1491
1492 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1493 /*
1494 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1495 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1496 */
1497 hwc->event_base =
1498 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1499 hwc->idx = idx;
1500 } else {
1501 idx = hwc->idx;
1502 /* Try to get the previous generic event again */
1503 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1504try_generic:
1505 idx = x86_pmu.get_event_idx(cpuc, hwc);
1506 if (idx == -1)
1507 return -EAGAIN;
1508
1509 set_bit(idx, cpuc->used_mask);
1510 hwc->idx = idx;
1511 }
1512 hwc->config_base = x86_pmu.eventsel;
1513 hwc->event_base = x86_pmu.perfctr;
1514 }
1515
1516 return idx;
1517} 965}
1518 966
1519/* 967static int x86_pmu_start(struct perf_event *event)
1520 * Find a PMC slot for the freshly enabled / scheduled in event:
1521 */
1522static int x86_pmu_enable(struct perf_event *event)
1523{ 968{
1524 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1525 struct hw_perf_event *hwc = &event->hw; 969 struct hw_perf_event *hwc = &event->hw;
1526 int idx;
1527
1528 idx = x86_schedule_event(cpuc, hwc);
1529 if (idx < 0)
1530 return idx;
1531 970
1532 perf_events_lapic_init(); 971 if (hwc->idx == -1)
1533 972 return -EAGAIN;
1534 x86_pmu.disable(hwc, idx);
1535 973
1536 cpuc->events[idx] = event; 974 x86_perf_event_set_period(event, hwc, hwc->idx);
1537 set_bit(idx, cpuc->active_mask); 975 x86_pmu.enable(hwc, hwc->idx);
1538
1539 x86_perf_event_set_period(event, hwc, idx);
1540 x86_pmu.enable(hwc, idx);
1541
1542 perf_event_update_userpage(event);
1543 976
1544 return 0; 977 return 0;
1545} 978}
@@ -1583,7 +1016,7 @@ void perf_event_print_debug(void)
1583 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1016 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1584 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1017 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1585 } 1018 }
1586 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); 1019 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1587 1020
1588 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1021 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1589 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1022 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -1607,67 +1040,7 @@ void perf_event_print_debug(void)
1607 local_irq_restore(flags); 1040 local_irq_restore(flags);
1608} 1041}
1609 1042
1610static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) 1043static void x86_pmu_stop(struct perf_event *event)
1611{
1612 struct debug_store *ds = cpuc->ds;
1613 struct bts_record {
1614 u64 from;
1615 u64 to;
1616 u64 flags;
1617 };
1618 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1619 struct bts_record *at, *top;
1620 struct perf_output_handle handle;
1621 struct perf_event_header header;
1622 struct perf_sample_data data;
1623 struct pt_regs regs;
1624
1625 if (!event)
1626 return;
1627
1628 if (!ds)
1629 return;
1630
1631 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1632 top = (struct bts_record *)(unsigned long)ds->bts_index;
1633
1634 if (top <= at)
1635 return;
1636
1637 ds->bts_index = ds->bts_buffer_base;
1638
1639
1640 data.period = event->hw.last_period;
1641 data.addr = 0;
1642 data.raw = NULL;
1643 regs.ip = 0;
1644
1645 /*
1646 * Prepare a generic sample, i.e. fill in the invariant fields.
1647 * We will overwrite the from and to address before we output
1648 * the sample.
1649 */
1650 perf_prepare_sample(&header, &data, event, &regs);
1651
1652 if (perf_output_begin(&handle, event,
1653 header.size * (top - at), 1, 1))
1654 return;
1655
1656 for (; at < top; at++) {
1657 data.ip = at->from;
1658 data.addr = at->to;
1659
1660 perf_output_sample(&handle, &header, &data, event);
1661 }
1662
1663 perf_output_end(&handle);
1664
1665 /* There's new data available. */
1666 event->hw.interrupts++;
1667 event->pending_kill = POLL_IN;
1668}
1669
1670static void x86_pmu_disable(struct perf_event *event)
1671{ 1044{
1672 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1045 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1673 struct hw_perf_event *hwc = &event->hw; 1046 struct hw_perf_event *hwc = &event->hw;
@@ -1681,183 +1054,38 @@ static void x86_pmu_disable(struct perf_event *event)
1681 x86_pmu.disable(hwc, idx); 1054 x86_pmu.disable(hwc, idx);
1682 1055
1683 /* 1056 /*
1684 * Make sure the cleared pointer becomes visible before we
1685 * (potentially) free the event:
1686 */
1687 barrier();
1688
1689 /*
1690 * Drain the remaining delta count out of a event 1057 * Drain the remaining delta count out of a event
1691 * that we are disabling: 1058 * that we are disabling:
1692 */ 1059 */
1693 x86_perf_event_update(event, hwc, idx); 1060 x86_perf_event_update(event, hwc, idx);
1694 1061
1695 /* Drain the remaining BTS records. */
1696 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
1697 intel_pmu_drain_bts_buffer(cpuc);
1698
1699 cpuc->events[idx] = NULL; 1062 cpuc->events[idx] = NULL;
1700 clear_bit(idx, cpuc->used_mask);
1701
1702 perf_event_update_userpage(event);
1703}
1704
1705/*
1706 * Save and restart an expired event. Called by NMI contexts,
1707 * so it has to be careful about preempting normal event ops:
1708 */
1709static int intel_pmu_save_and_restart(struct perf_event *event)
1710{
1711 struct hw_perf_event *hwc = &event->hw;
1712 int idx = hwc->idx;
1713 int ret;
1714
1715 x86_perf_event_update(event, hwc, idx);
1716 ret = x86_perf_event_set_period(event, hwc, idx);
1717
1718 if (event->state == PERF_EVENT_STATE_ACTIVE)
1719 intel_pmu_enable_event(hwc, idx);
1720
1721 return ret;
1722} 1063}
1723 1064
1724static void intel_pmu_reset(void) 1065static void x86_pmu_disable(struct perf_event *event)
1725{
1726 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
1727 unsigned long flags;
1728 int idx;
1729
1730 if (!x86_pmu.num_events)
1731 return;
1732
1733 local_irq_save(flags);
1734
1735 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1736
1737 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1738 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1739 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1740 }
1741 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1742 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1743 }
1744 if (ds)
1745 ds->bts_index = ds->bts_buffer_base;
1746
1747 local_irq_restore(flags);
1748}
1749
1750static int p6_pmu_handle_irq(struct pt_regs *regs)
1751{
1752 struct perf_sample_data data;
1753 struct cpu_hw_events *cpuc;
1754 struct perf_event *event;
1755 struct hw_perf_event *hwc;
1756 int idx, handled = 0;
1757 u64 val;
1758
1759 data.addr = 0;
1760 data.raw = NULL;
1761
1762 cpuc = &__get_cpu_var(cpu_hw_events);
1763
1764 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1765 if (!test_bit(idx, cpuc->active_mask))
1766 continue;
1767
1768 event = cpuc->events[idx];
1769 hwc = &event->hw;
1770
1771 val = x86_perf_event_update(event, hwc, idx);
1772 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1773 continue;
1774
1775 /*
1776 * event overflow
1777 */
1778 handled = 1;
1779 data.period = event->hw.last_period;
1780
1781 if (!x86_perf_event_set_period(event, hwc, idx))
1782 continue;
1783
1784 if (perf_event_overflow(event, 1, &data, regs))
1785 p6_pmu_disable_event(hwc, idx);
1786 }
1787
1788 if (handled)
1789 inc_irq_stat(apic_perf_irqs);
1790
1791 return handled;
1792}
1793
1794/*
1795 * This handler is triggered by the local APIC, so the APIC IRQ handling
1796 * rules apply:
1797 */
1798static int intel_pmu_handle_irq(struct pt_regs *regs)
1799{ 1066{
1800 struct perf_sample_data data; 1067 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1801 struct cpu_hw_events *cpuc; 1068 int i;
1802 int bit, loops;
1803 u64 ack, status;
1804
1805 data.addr = 0;
1806 data.raw = NULL;
1807
1808 cpuc = &__get_cpu_var(cpu_hw_events);
1809
1810 perf_disable();
1811 intel_pmu_drain_bts_buffer(cpuc);
1812 status = intel_pmu_get_status();
1813 if (!status) {
1814 perf_enable();
1815 return 0;
1816 }
1817
1818 loops = 0;
1819again:
1820 if (++loops > 100) {
1821 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
1822 perf_event_print_debug();
1823 intel_pmu_reset();
1824 perf_enable();
1825 return 1;
1826 }
1827 1069
1828 inc_irq_stat(apic_perf_irqs); 1070 x86_pmu_stop(event);
1829 ack = status;
1830 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1831 struct perf_event *event = cpuc->events[bit];
1832 1071
1833 clear_bit(bit, (unsigned long *) &status); 1072 for (i = 0; i < cpuc->n_events; i++) {
1834 if (!test_bit(bit, cpuc->active_mask)) 1073 if (event == cpuc->event_list[i]) {
1835 continue;
1836 1074
1837 if (!intel_pmu_save_and_restart(event)) 1075 if (x86_pmu.put_event_constraints)
1838 continue; 1076 x86_pmu.put_event_constraints(cpuc, event);
1839 1077
1840 data.period = event->hw.last_period; 1078 while (++i < cpuc->n_events)
1079 cpuc->event_list[i-1] = cpuc->event_list[i];
1841 1080
1842 if (perf_event_overflow(event, 1, &data, regs)) 1081 --cpuc->n_events;
1843 intel_pmu_disable_event(&event->hw, bit); 1082 break;
1083 }
1844 } 1084 }
1845 1085 perf_event_update_userpage(event);
1846 intel_pmu_ack_status(ack);
1847
1848 /*
1849 * Repeat if there is more work to be done:
1850 */
1851 status = intel_pmu_get_status();
1852 if (status)
1853 goto again;
1854
1855 perf_enable();
1856
1857 return 1;
1858} 1086}
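
x86_pmu_disable() above drops the event from cpuc->event_list[] by sliding the tail entries down one slot, which keeps the list dense and preserves the ordering the scheduler relies on. A standalone sketch of that compaction on a plain array, not part of the patch:

#include <stdio.h>

static int list[] = { 10, 20, 30, 40 };
static int n = 4;

/* remove one entry and shift the tail down, as the disable path does */
static void remove_entry(int victim)
{
	int i;

	for (i = 0; i < n; i++) {
		if (list[i] == victim) {
			while (++i < n)
				list[i - 1] = list[i];
			--n;
			break;
		}
	}
}

int main(void)
{
	int i;

	remove_entry(20);
	for (i = 0; i < n; i++)
		printf("%d ", list[i]);	/* 10 30 40 */
	printf("\n");
	return 0;
}
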
1859 1087
1860static int amd_pmu_handle_irq(struct pt_regs *regs) 1088static int x86_pmu_handle_irq(struct pt_regs *regs)
1861{ 1089{
1862 struct perf_sample_data data; 1090 struct perf_sample_data data;
1863 struct cpu_hw_events *cpuc; 1091 struct cpu_hw_events *cpuc;
@@ -1892,7 +1120,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1892 continue; 1120 continue;
1893 1121
1894 if (perf_event_overflow(event, 1, &data, regs)) 1122 if (perf_event_overflow(event, 1, &data, regs))
1895 amd_pmu_disable_event(hwc, idx); 1123 x86_pmu.disable(hwc, idx);
1896 } 1124 }
1897 1125
1898 if (handled) 1126 if (handled)
@@ -1975,194 +1203,137 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1975 .priority = 1 1203 .priority = 1
1976}; 1204};
1977 1205
1978static __initconst struct x86_pmu p6_pmu = { 1206static struct event_constraint unconstrained;
1979 .name = "p6", 1207static struct event_constraint emptyconstraint;
1980 .handle_irq = p6_pmu_handle_irq,
1981 .disable_all = p6_pmu_disable_all,
1982 .enable_all = p6_pmu_enable_all,
1983 .enable = p6_pmu_enable_event,
1984 .disable = p6_pmu_disable_event,
1985 .eventsel = MSR_P6_EVNTSEL0,
1986 .perfctr = MSR_P6_PERFCTR0,
1987 .event_map = p6_pmu_event_map,
1988 .raw_event = p6_pmu_raw_event,
1989 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1990 .apic = 1,
1991 .max_period = (1ULL << 31) - 1,
1992 .version = 0,
1993 .num_events = 2,
1994 /*
1995 * Events have 40 bits implemented. However they are designed such
1996 * that bits [32-39] are sign extensions of bit 31. As such the
1997 * effective width of a event for P6-like PMU is 32 bits only.
1998 *
1999 * See IA-32 Intel Architecture Software developer manual Vol 3B
2000 */
2001 .event_bits = 32,
2002 .event_mask = (1ULL << 32) - 1,
2003 .get_event_idx = intel_get_event_idx,
2004};
2005 1208
2006static __initconst struct x86_pmu intel_pmu = { 1209static struct event_constraint *
2007 .name = "Intel", 1210x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
2008 .handle_irq = intel_pmu_handle_irq, 1211{
2009 .disable_all = intel_pmu_disable_all, 1212 struct event_constraint *c;
2010 .enable_all = intel_pmu_enable_all,
2011 .enable = intel_pmu_enable_event,
2012 .disable = intel_pmu_disable_event,
2013 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
2014 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
2015 .event_map = intel_pmu_event_map,
2016 .raw_event = intel_pmu_raw_event,
2017 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
2018 .apic = 1,
2019 /*
2020 * Intel PMCs cannot be accessed sanely above 32 bit width,
2021 * so we install an artificial 1<<31 period regardless of
2022 * the generic event period:
2023 */
2024 .max_period = (1ULL << 31) - 1,
2025 .enable_bts = intel_pmu_enable_bts,
2026 .disable_bts = intel_pmu_disable_bts,
2027 .get_event_idx = intel_get_event_idx,
2028};
2029 1213
2030static __initconst struct x86_pmu amd_pmu = { 1214 if (x86_pmu.event_constraints) {
2031 .name = "AMD", 1215 for_each_event_constraint(c, x86_pmu.event_constraints) {
2032 .handle_irq = amd_pmu_handle_irq, 1216 if ((event->hw.config & c->cmask) == c->code)
2033 .disable_all = amd_pmu_disable_all, 1217 return c;
2034 .enable_all = amd_pmu_enable_all, 1218 }
2035 .enable = amd_pmu_enable_event, 1219 }
2036 .disable = amd_pmu_disable_event, 1220
2037 .eventsel = MSR_K7_EVNTSEL0, 1221 return &unconstrained;
2038 .perfctr = MSR_K7_PERFCTR0, 1222}
2039 .event_map = amd_pmu_event_map,
2040 .raw_event = amd_pmu_raw_event,
2041 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
2042 .num_events = 4,
2043 .event_bits = 48,
2044 .event_mask = (1ULL << 48) - 1,
2045 .apic = 1,
2046 /* use highest bit to detect overflow */
2047 .max_period = (1ULL << 47) - 1,
2048 .get_event_idx = gen_get_event_idx,
2049};
2050 1223
2051static __init int p6_pmu_init(void) 1224static int x86_event_sched_in(struct perf_event *event,
1225 struct perf_cpu_context *cpuctx)
2052{ 1226{
2053 switch (boot_cpu_data.x86_model) { 1227 int ret = 0;
2054 case 1:
2055 case 3: /* Pentium Pro */
2056 case 5:
2057 case 6: /* Pentium II */
2058 case 7:
2059 case 8:
2060 case 11: /* Pentium III */
2061 event_constraints = intel_p6_event_constraints;
2062 break;
2063 case 9:
2064 case 13:
2065 /* Pentium M */
2066 event_constraints = intel_p6_event_constraints;
2067 break;
2068 default:
2069 pr_cont("unsupported p6 CPU model %d ",
2070 boot_cpu_data.x86_model);
2071 return -ENODEV;
2072 }
2073 1228
2074 x86_pmu = p6_pmu; 1229 event->state = PERF_EVENT_STATE_ACTIVE;
1230 event->oncpu = smp_processor_id();
1231 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
2075 1232
2076 return 0; 1233 if (!is_x86_event(event))
1234 ret = event->pmu->enable(event);
1235
1236 if (!ret && !is_software_event(event))
1237 cpuctx->active_oncpu++;
1238
1239 if (!ret && event->attr.exclusive)
1240 cpuctx->exclusive = 1;
1241
1242 return ret;
2077} 1243}
2078 1244
2079static __init int intel_pmu_init(void) 1245static void x86_event_sched_out(struct perf_event *event,
1246 struct perf_cpu_context *cpuctx)
2080{ 1247{
2081 union cpuid10_edx edx; 1248 event->state = PERF_EVENT_STATE_INACTIVE;
2082 union cpuid10_eax eax; 1249 event->oncpu = -1;
2083 unsigned int unused;
2084 unsigned int ebx;
2085 int version;
2086
2087 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
2088 /* check for P6 processor family */
2089 if (boot_cpu_data.x86 == 6) {
2090 return p6_pmu_init();
2091 } else {
2092 return -ENODEV;
2093 }
2094 }
2095 1250
2096 /* 1251 if (!is_x86_event(event))
2097 * Check whether the Architectural PerfMon supports 1252 event->pmu->disable(event);
2098 * Branch Misses Retired hw_event or not.
2099 */
2100 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
2101 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
2102 return -ENODEV;
2103 1253
2104 version = eax.split.version_id; 1254 event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
2105 if (version < 2)
2106 return -ENODEV;
2107 1255
2108 x86_pmu = intel_pmu; 1256 if (!is_software_event(event))
2109 x86_pmu.version = version; 1257 cpuctx->active_oncpu--;
2110 x86_pmu.num_events = eax.split.num_events;
2111 x86_pmu.event_bits = eax.split.bit_width;
2112 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
2113 1258
2114 /* 1259 if (event->attr.exclusive || !cpuctx->active_oncpu)
2115 * Quirk: v2 perfmon does not report fixed-purpose events, so 1260 cpuctx->exclusive = 0;
2116 * assume at least 3 events: 1261}
2117 */
2118 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
2119 1262
1263/*
1264 * Called to enable a whole group of events.
1265 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
1266 * Assumes the caller has disabled interrupts and has
1267 * frozen the PMU with hw_perf_save_disable.
1268 *
1269 * called with PMU disabled. If successful and return value 1,
1270 * then guaranteed to call perf_enable() and hw_perf_enable()
1271 */
1272int hw_perf_group_sched_in(struct perf_event *leader,
1273 struct perf_cpu_context *cpuctx,
1274 struct perf_event_context *ctx)
1275{
1276 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1277 struct perf_event *sub;
1278 int assign[X86_PMC_IDX_MAX];
1279 int n0, n1, ret;
1280
1281 /* n0 = total number of events */
1282 n0 = collect_events(cpuc, leader, true);
1283 if (n0 < 0)
1284 return n0;
1285
1286 ret = x86_schedule_events(cpuc, n0, assign);
1287 if (ret)
1288 return ret;
1289
1290 ret = x86_event_sched_in(leader, cpuctx);
1291 if (ret)
1292 return ret;
1293
1294 n1 = 1;
1295 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1296 if (sub->state > PERF_EVENT_STATE_OFF) {
1297 ret = x86_event_sched_in(sub, cpuctx);
1298 if (ret)
1299 goto undo;
1300 ++n1;
1301 }
1302 }
2120 /* 1303 /*
2121 * Install the hw-cache-events table: 1304 * copy new assignment, now we know it is possible
1305 * will be used by hw_perf_enable()
2122 */ 1306 */
2123 switch (boot_cpu_data.x86_model) { 1307 memcpy(cpuc->assign, assign, n0*sizeof(int));
2124 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
2125 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
2126 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
2127 case 29: /* six-core 45 nm xeon "Dunnington" */
2128 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
2129 sizeof(hw_cache_event_ids));
2130
2131 pr_cont("Core2 events, ");
2132 event_constraints = intel_core_event_constraints;
2133 break;
2134 default:
2135 case 26:
2136 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2137 sizeof(hw_cache_event_ids));
2138 1308
2139 event_constraints = intel_nehalem_event_constraints; 1309 cpuc->n_events = n0;
2140 pr_cont("Nehalem/Corei7 events, "); 1310 cpuc->n_added = n1;
2141 break; 1311 ctx->nr_active += n1;
2142 case 28:
2143 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2144 sizeof(hw_cache_event_ids));
2145 1312
2146 pr_cont("Atom events, "); 1313 /*
2147 break; 1314 * 1 means successful and events are active
1315 * This is not quite true because we defer
1316 * actual activation until hw_perf_enable() but
                                                              1317 * this way we ensure caller won't try to enable
1318 * individual events
1319 */
1320 return 1;
1321undo:
1322 x86_event_sched_out(leader, cpuctx);
1323 n0 = 1;
1324 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1325 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
1326 x86_event_sched_out(sub, cpuctx);
1327 if (++n0 == n1)
1328 break;
1329 }
2148 } 1330 }
2149 return 0; 1331 return ret;
2150} 1332}
2151 1333
2152static __init int amd_pmu_init(void) 1334#include "perf_event_amd.c"
2153{ 1335#include "perf_event_p6.c"
2154 /* Performance-monitoring supported from K7 and later: */ 1336#include "perf_event_intel.c"
2155 if (boot_cpu_data.x86 < 6)
2156 return -ENODEV;
2157
2158 x86_pmu = amd_pmu;
2159
2160 /* Events are common for all AMDs */
2161 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
2162 sizeof(hw_cache_event_ids));
2163
2164 return 0;
2165}
2166 1337
2167static void __init pmu_check_apic(void) 1338static void __init pmu_check_apic(void)
2168{ 1339{
@@ -2220,6 +1391,10 @@ void __init init_hw_perf_events(void)
2220 perf_events_lapic_init(); 1391 perf_events_lapic_init();
2221 register_die_notifier(&perf_event_nmi_notifier); 1392 register_die_notifier(&perf_event_nmi_notifier);
2222 1393
1394 unconstrained = (struct event_constraint)
1395 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
1396 0, x86_pmu.num_events);
1397
2223 pr_info("... version: %d\n", x86_pmu.version); 1398 pr_info("... version: %d\n", x86_pmu.version);
2224 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1399 pr_info("... bit width: %d\n", x86_pmu.event_bits);
2225 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1400 pr_info("... generic registers: %d\n", x86_pmu.num_events);
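
init_hw_perf_events() above builds the unconstrained constraint from the number of generic counters, so an event with no specific restriction may be placed on any of them. A standalone sketch of the resulting index mask for a PMU with four generic counters (the AMD case in this patch), not part of the patch:

#include <stdio.h>

int main(void)
{
	int num_events = 4;	/* e.g. the AMD PMU below */
	unsigned long long idxmsk = (1ULL << num_events) - 1;

	printf("idxmsk = 0x%llx, weight = %d\n", idxmsk, num_events);
	/* prints: idxmsk = 0xf, weight = 4 -> any of counters 0..3 */
	return 0;
}
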
@@ -2237,50 +1412,79 @@ static inline void x86_pmu_read(struct perf_event *event)
2237static const struct pmu pmu = { 1412static const struct pmu pmu = {
2238 .enable = x86_pmu_enable, 1413 .enable = x86_pmu_enable,
2239 .disable = x86_pmu_disable, 1414 .disable = x86_pmu_disable,
1415 .start = x86_pmu_start,
1416 .stop = x86_pmu_stop,
2240 .read = x86_pmu_read, 1417 .read = x86_pmu_read,
2241 .unthrottle = x86_pmu_unthrottle, 1418 .unthrottle = x86_pmu_unthrottle,
2242}; 1419};
2243 1420
2244static int 1421/*
2245validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) 1422 * validate a single event group
2246{ 1423 *
 2247 struct hw_perf_event fake_event = event->hw; 1424 * validation includes:
 2248                                              1425 * - check events are compatible with each other
2249 if (event->pmu && event->pmu != &pmu) 1426 * - events do not compete for the same counter
2250 return 0; 1427 * - number of events <= number of counters
2251 1428 *
2252 return x86_schedule_event(cpuc, &fake_event) >= 0; 1429 * validation ensures the group can be loaded onto the
2253} 1430 * PMU if it was the only group available.
2254 1431 */
2255static int validate_group(struct perf_event *event) 1432static int validate_group(struct perf_event *event)
2256{ 1433{
2257 struct perf_event *sibling, *leader = event->group_leader; 1434 struct perf_event *leader = event->group_leader;
2258 struct cpu_hw_events fake_pmu; 1435 struct cpu_hw_events *fake_cpuc;
1436 int ret, n;
2259 1437
2260 memset(&fake_pmu, 0, sizeof(fake_pmu)); 1438 ret = -ENOMEM;
1439 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1440 if (!fake_cpuc)
1441 goto out;
1442
1443 /*
1444 * the event is not yet connected with its
1445 * siblings therefore we must first collect
1446 * existing siblings, then add the new event
1447 * before we can simulate the scheduling
1448 */
1449 ret = -ENOSPC;
1450 n = collect_events(fake_cpuc, leader, true);
1451 if (n < 0)
1452 goto out_free;
2261 1453
2262 if (!validate_event(&fake_pmu, leader)) 1454 fake_cpuc->n_events = n;
2263 return -ENOSPC; 1455 n = collect_events(fake_cpuc, event, false);
1456 if (n < 0)
1457 goto out_free;
2264 1458
2265 list_for_each_entry(sibling, &leader->sibling_list, group_entry) { 1459 fake_cpuc->n_events = n;
2266 if (!validate_event(&fake_pmu, sibling))
2267 return -ENOSPC;
2268 }
2269 1460
2270 if (!validate_event(&fake_pmu, event)) 1461 ret = x86_schedule_events(fake_cpuc, n, NULL);
2271 return -ENOSPC;
2272 1462
2273 return 0; 1463out_free:
1464 kfree(fake_cpuc);
1465out:
1466 return ret;
2274} 1467}
2275 1468
2276const struct pmu *hw_perf_event_init(struct perf_event *event) 1469const struct pmu *hw_perf_event_init(struct perf_event *event)
2277{ 1470{
1471 const struct pmu *tmp;
2278 int err; 1472 int err;
2279 1473
2280 err = __hw_perf_event_init(event); 1474 err = __hw_perf_event_init(event);
2281 if (!err) { 1475 if (!err) {
1476 /*
1477 * we temporarily connect event to its pmu
1478 * such that validate_group() can classify
1479 * it as an x86 event using is_x86_event()
1480 */
1481 tmp = event->pmu;
1482 event->pmu = &pmu;
1483
2282 if (event->group_leader != event) 1484 if (event->group_leader != event)
2283 err = validate_group(event); 1485 err = validate_group(event);
1486
1487 event->pmu = tmp;
2284 } 1488 }
2285 if (err) { 1489 if (err) {
2286 if (event->destroy) 1490 if (event->destroy)
@@ -2304,7 +1508,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2304 1508
2305static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1509static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2306static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 1510static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2307static DEFINE_PER_CPU(int, in_ignored_frame);
2308 1511
2309 1512
2310static void 1513static void
@@ -2320,10 +1523,6 @@ static void backtrace_warning(void *data, char *msg)
2320 1523
2321static int backtrace_stack(void *data, char *name) 1524static int backtrace_stack(void *data, char *name)
2322{ 1525{
2323 per_cpu(in_ignored_frame, smp_processor_id()) =
2324 x86_is_stack_id(NMI_STACK, name) ||
2325 x86_is_stack_id(DEBUG_STACK, name);
2326
2327 return 0; 1526 return 0;
2328} 1527}
2329 1528
@@ -2331,9 +1530,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2331{ 1530{
2332 struct perf_callchain_entry *entry = data; 1531 struct perf_callchain_entry *entry = data;
2333 1532
2334 if (per_cpu(in_ignored_frame, smp_processor_id()))
2335 return;
2336
2337 if (reliable) 1533 if (reliable)
2338 callchain_store(entry, addr); 1534 callchain_store(entry, addr);
2339} 1535}
@@ -2440,9 +1636,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
2440 1636
2441 is_user = user_mode(regs); 1637 is_user = user_mode(regs);
2442 1638
2443 if (!current || current->pid == 0)
2444 return;
2445
2446 if (is_user && current->state != TASK_RUNNING) 1639 if (is_user && current->state != TASK_RUNNING)
2447 return; 1640 return;
2448 1641
@@ -2472,4 +1665,25 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2472void hw_perf_event_setup_online(int cpu) 1665void hw_perf_event_setup_online(int cpu)
2473{ 1666{
2474 init_debug_store_on_cpu(cpu); 1667 init_debug_store_on_cpu(cpu);
1668
1669 switch (boot_cpu_data.x86_vendor) {
1670 case X86_VENDOR_AMD:
1671 amd_pmu_cpu_online(cpu);
1672 break;
1673 default:
1674 return;
1675 }
1676}
1677
1678void hw_perf_event_setup_offline(int cpu)
1679{
1680 init_debug_store_on_cpu(cpu);
1681
1682 switch (boot_cpu_data.x86_vendor) {
1683 case X86_VENDOR_AMD:
1684 amd_pmu_cpu_offline(cpu);
1685 break;
1686 default:
1687 return;
1688 }
2475} 1689}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
new file mode 100644
index 000000000000..8f3dbfda3c4f
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -0,0 +1,416 @@
1#ifdef CONFIG_CPU_SUP_AMD
2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
9{
10 [ C(L1D) ] = {
11 [ C(OP_READ) ] = {
12 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
13 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
14 },
15 [ C(OP_WRITE) ] = {
16 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
17 [ C(RESULT_MISS) ] = 0,
18 },
19 [ C(OP_PREFETCH) ] = {
20 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
21 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
22 },
23 },
24 [ C(L1I ) ] = {
25 [ C(OP_READ) ] = {
26 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
27 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
28 },
29 [ C(OP_WRITE) ] = {
30 [ C(RESULT_ACCESS) ] = -1,
31 [ C(RESULT_MISS) ] = -1,
32 },
33 [ C(OP_PREFETCH) ] = {
34 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
35 [ C(RESULT_MISS) ] = 0,
36 },
37 },
38 [ C(LL ) ] = {
39 [ C(OP_READ) ] = {
40 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
41 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
42 },
43 [ C(OP_WRITE) ] = {
44 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
45 [ C(RESULT_MISS) ] = 0,
46 },
47 [ C(OP_PREFETCH) ] = {
48 [ C(RESULT_ACCESS) ] = 0,
49 [ C(RESULT_MISS) ] = 0,
50 },
51 },
52 [ C(DTLB) ] = {
53 [ C(OP_READ) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
55 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
56 },
57 [ C(OP_WRITE) ] = {
58 [ C(RESULT_ACCESS) ] = 0,
59 [ C(RESULT_MISS) ] = 0,
60 },
61 [ C(OP_PREFETCH) ] = {
62 [ C(RESULT_ACCESS) ] = 0,
63 [ C(RESULT_MISS) ] = 0,
64 },
65 },
66 [ C(ITLB) ] = {
67 [ C(OP_READ) ] = {
  68 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
69 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
70 },
71 [ C(OP_WRITE) ] = {
72 [ C(RESULT_ACCESS) ] = -1,
73 [ C(RESULT_MISS) ] = -1,
74 },
75 [ C(OP_PREFETCH) ] = {
76 [ C(RESULT_ACCESS) ] = -1,
77 [ C(RESULT_MISS) ] = -1,
78 },
79 },
80 [ C(BPU ) ] = {
81 [ C(OP_READ) ] = {
82 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
83 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
84 },
85 [ C(OP_WRITE) ] = {
86 [ C(RESULT_ACCESS) ] = -1,
87 [ C(RESULT_MISS) ] = -1,
88 },
89 [ C(OP_PREFETCH) ] = {
90 [ C(RESULT_ACCESS) ] = -1,
91 [ C(RESULT_MISS) ] = -1,
92 },
93 },
94};
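
The table above is indexed as [cache][operation][result]; a zero entry means there is no event for that combination and -1 means the operation itself is not supported, and the generic code rejects both. A standalone sketch of such a lookup, not part of the patch; the enum names below are invented for illustration and only two rows of the table are reproduced.

#include <stdio.h>

enum { L1D, L1I, LL, DTLB, ITLB, BPU, CACHE_MAX };
enum { OP_READ, OP_WRITE, OP_PREFETCH, OP_MAX };
enum { RESULT_ACCESS, RESULT_MISS, RESULT_MAX };

static const long long table[CACHE_MAX][OP_MAX][RESULT_MAX] = {
	[L1D][OP_READ]  = { 0x0040, 0x0041 },	/* DC accesses / misses */
	[L1I][OP_WRITE] = { -1, -1 },		/* operation not supported */
};

int main(void)
{
	long long cfg = table[L1D][OP_READ][RESULT_MISS];

	if (cfg == -1)
		printf("operation not supported\n");
	else if (cfg == 0)
		printf("no event for this combination\n");
	else
		printf("raw event code 0x%04llx\n", cfg);	/* 0x0041 */

	return 0;
}
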
95
96/*
97 * AMD Performance Monitor K7 and later.
98 */
99static const u64 amd_perfmon_event_map[] =
100{
101 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
107};
108
109static u64 amd_pmu_event_map(int hw_event)
110{
111 return amd_perfmon_event_map[hw_event];
112}
113
114static u64 amd_pmu_raw_event(u64 hw_event)
115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
121
122#define K7_EVNTSEL_MASK \
123 (K7_EVNTSEL_EVENT_MASK | \
124 K7_EVNTSEL_UNIT_MASK | \
125 K7_EVNTSEL_EDGE_MASK | \
126 K7_EVNTSEL_INV_MASK | \
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130}
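
amd_pmu_raw_event() above keeps only the event-select, unit-mask, edge, invert and counter-mask fields of a user-supplied raw config; the enable, interrupt and privilege-level bits are stripped out and set later by the core code. A standalone sketch applying the same mask values, not part of the patch:

#include <stdio.h>

#define K7_EVNTSEL_EVENT_MASK	0xF000000FFULL
#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
#define K7_EVNTSEL_INV_MASK	0x000800000ULL
#define K7_EVNTSEL_REG_MASK	0x0FF000000ULL

#define K7_EVNTSEL_MASK						\
	(K7_EVNTSEL_EVENT_MASK | K7_EVNTSEL_UNIT_MASK |		\
	 K7_EVNTSEL_EDGE_MASK | K7_EVNTSEL_INV_MASK |		\
	 K7_EVNTSEL_REG_MASK)

int main(void)
{
	/* event 0x76 (CPU cycles) with stray enable/int/usr/os bits set */
	unsigned long long raw = 0x530076ULL;

	/* only the whitelisted fields survive */
	printf("0x%llx -> 0x%llx\n", raw, raw & K7_EVNTSEL_MASK);	/* 0x76 */
	return 0;
}
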
131
132/*
133 * AMD64 events are detected based on their event codes.
134 */
135static inline int amd_is_nb_event(struct hw_perf_event *hwc)
136{
137 return (hwc->config & 0xe0) == 0xe0;
138}
139
140static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
141 struct perf_event *event)
142{
143 struct hw_perf_event *hwc = &event->hw;
144 struct amd_nb *nb = cpuc->amd_nb;
145 int i;
146
147 /*
148 * only care about NB events
149 */
150 if (!(nb && amd_is_nb_event(hwc)))
151 return;
152
153 /*
154 * need to scan whole list because event may not have
155 * been assigned during scheduling
156 *
157 * no race condition possible because event can only
158 * be removed on one CPU at a time AND PMU is disabled
159 * when we come here
160 */
161 for (i = 0; i < x86_pmu.num_events; i++) {
162 if (nb->owners[i] == event) {
163 cmpxchg(nb->owners+i, event, NULL);
164 break;
165 }
166 }
167}
168
169 /*
170 * AMD64 NorthBridge events need special treatment because
171 * counter access needs to be synchronized across all cores
172 * of a package. Refer to BKDG section 3.12
173 *
 174 * NB events are events measuring L3 cache and HyperTransport
175 * traffic. They are identified by an event code >= 0xe00.
 176 * They measure events on the NorthBridge which is shared
177 * by all cores on a package. NB events are counted on a
178 * shared set of counters. When a NB event is programmed
179 * in a counter, the data actually comes from a shared
180 * counter. Thus, access to those counters needs to be
181 * synchronized.
182 *
183 * We implement the synchronization such that no two cores
184 * can be measuring NB events using the same counters. Thus,
185 * we maintain a per-NB allocation table. The available slot
186 * is propagated using the event_constraint structure.
187 *
188 * We provide only one choice for each NB event based on
189 * the fact that only NB events have restrictions. Consequently,
190 * if a counter is available, there is a guarantee the NB event
191 * will be assigned to it. If no slot is available, an empty
192 * constraint is returned and scheduling will eventually fail
193 * for this event.
194 *
 195 * Note that all cores attached to the same NB compete for the same
196 * counters to host NB events, this is why we use atomic ops. Some
197 * multi-chip CPUs may have more than one NB.
198 *
199 * Given that resources are allocated (cmpxchg), they must be
200 * eventually freed for others to use. This is accomplished by
201 * calling amd_put_event_constraints().
202 *
203 * Non NB events are not impacted by this restriction.
204 */
205static struct event_constraint *
206amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
207{
208 struct hw_perf_event *hwc = &event->hw;
209 struct amd_nb *nb = cpuc->amd_nb;
210 struct perf_event *old = NULL;
211 int max = x86_pmu.num_events;
212 int i, j, k = -1;
213
214 /*
215 * if not NB event or no NB, then no constraints
216 */
217 if (!(nb && amd_is_nb_event(hwc)))
218 return &unconstrained;
219
220 /*
221 * detect if already present, if so reuse
222 *
223 * cannot merge with actual allocation
224 * because of possible holes
225 *
226 * event can already be present yet not assigned (in hwc->idx)
227 * because of successive calls to x86_schedule_events() from
228 * hw_perf_group_sched_in() without hw_perf_enable()
229 */
230 for (i = 0; i < max; i++) {
231 /*
232 * keep track of first free slot
233 */
234 if (k == -1 && !nb->owners[i])
235 k = i;
236
237 /* already present, reuse */
238 if (nb->owners[i] == event)
239 goto done;
240 }
241 /*
242 * not present, so grab a new slot
243 * starting either at:
244 */
245 if (hwc->idx != -1) {
246 /* previous assignment */
247 i = hwc->idx;
248 } else if (k != -1) {
249 /* start from free slot found */
250 i = k;
251 } else {
252 /*
253 * event not found, no slot found in
254 * first pass, try again from the
255 * beginning
256 */
257 i = 0;
258 }
259 j = i;
260 do {
261 old = cmpxchg(nb->owners+i, NULL, event);
262 if (!old)
263 break;
264 if (++i == max)
265 i = 0;
266 } while (i != j);
267done:
268 if (!old)
269 return &nb->event_constraints[i];
270
271 return &emptyconstraint;
272}
273
274static __initconst struct x86_pmu amd_pmu = {
275 .name = "AMD",
276 .handle_irq = x86_pmu_handle_irq,
277 .disable_all = x86_pmu_disable_all,
278 .enable_all = x86_pmu_enable_all,
279 .enable = x86_pmu_enable_event,
280 .disable = x86_pmu_disable_event,
281 .eventsel = MSR_K7_EVNTSEL0,
282 .perfctr = MSR_K7_PERFCTR0,
283 .event_map = amd_pmu_event_map,
284 .raw_event = amd_pmu_raw_event,
285 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
286 .num_events = 4,
287 .event_bits = 48,
288 .event_mask = (1ULL << 48) - 1,
289 .apic = 1,
290 /* use highest bit to detect overflow */
291 .max_period = (1ULL << 47) - 1,
292 .get_event_constraints = amd_get_event_constraints,
293 .put_event_constraints = amd_put_event_constraints
294};
295
296static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
297{
298 struct amd_nb *nb;
299 int i;
300
301 nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
302 if (!nb)
303 return NULL;
304
305 memset(nb, 0, sizeof(*nb));
306 nb->nb_id = nb_id;
307
308 /*
309 * initialize all possible NB constraints
310 */
311 for (i = 0; i < x86_pmu.num_events; i++) {
312 set_bit(i, nb->event_constraints[i].idxmsk);
313 nb->event_constraints[i].weight = 1;
314 }
315 return nb;
316}
317
318static void amd_pmu_cpu_online(int cpu)
319{
320 struct cpu_hw_events *cpu1, *cpu2;
321 struct amd_nb *nb = NULL;
322 int i, nb_id;
323
324 if (boot_cpu_data.x86_max_cores < 2)
325 return;
326
327 /*
328 * function may be called too early in the
329 * boot process, in which case nb_id is bogus
330 */
331 nb_id = amd_get_nb_id(cpu);
332 if (nb_id == BAD_APICID)
333 return;
334
335 cpu1 = &per_cpu(cpu_hw_events, cpu);
336 cpu1->amd_nb = NULL;
337
338 raw_spin_lock(&amd_nb_lock);
339
340 for_each_online_cpu(i) {
341 cpu2 = &per_cpu(cpu_hw_events, i);
342 nb = cpu2->amd_nb;
343 if (!nb)
344 continue;
345 if (nb->nb_id == nb_id)
346 goto found;
347 }
348
349 nb = amd_alloc_nb(cpu, nb_id);
350 if (!nb) {
351 pr_err("perf_events: failed NB allocation for CPU%d\n", cpu);
352 raw_spin_unlock(&amd_nb_lock);
353 return;
354 }
355found:
356 nb->refcnt++;
357 cpu1->amd_nb = nb;
358
359 raw_spin_unlock(&amd_nb_lock);
360}
361
362static void amd_pmu_cpu_offline(int cpu)
363{
364 struct cpu_hw_events *cpuhw;
365
366 if (boot_cpu_data.x86_max_cores < 2)
367 return;
368
369 cpuhw = &per_cpu(cpu_hw_events, cpu);
370
371 raw_spin_lock(&amd_nb_lock);
372
373 if (--cpuhw->amd_nb->refcnt == 0)
374 kfree(cpuhw->amd_nb);
375
376 cpuhw->amd_nb = NULL;
377
378 raw_spin_unlock(&amd_nb_lock);
379}
380
381static __init int amd_pmu_init(void)
382{
383 /* Performance-monitoring supported from K7 and later: */
384 if (boot_cpu_data.x86 < 6)
385 return -ENODEV;
386
387 x86_pmu = amd_pmu;
388
389 /* Events are common for all AMDs */
390 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
391 sizeof(hw_cache_event_ids));
392
393 /*
394 * explicitly initialize the boot cpu, other cpus will get
395 * the cpu hotplug callbacks from smp_init()
396 */
397 amd_pmu_cpu_online(smp_processor_id());
398 return 0;
399}
400
401#else /* CONFIG_CPU_SUP_AMD */
402
403static int amd_pmu_init(void)
404{
405 return 0;
406}
407
408static void amd_pmu_cpu_online(int cpu)
409{
410}
411
412static void amd_pmu_cpu_offline(int cpu)
413{
414}
415
416#endif
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
new file mode 100644
index 000000000000..977e7544738c
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -0,0 +1,971 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Intel PerfMon v3. Used on Core2 and later.
5 */
6static const u64 intel_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
15};
16
17static struct event_constraint intel_core_event_constraints[] =
18{
19 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
20 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
21 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
22 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
23 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
24 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
25 EVENT_CONSTRAINT_END
26};
27
28static struct event_constraint intel_core2_event_constraints[] =
29{
30 FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */
31 FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */
32 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
33 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
34 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
35 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
36 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
37 INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
38 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
39 INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
40 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
41 EVENT_CONSTRAINT_END
42};
43
44static struct event_constraint intel_nehalem_event_constraints[] =
45{
46 FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */
47 FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */
48 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
49 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
50 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
51 INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
52 INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
53 INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
54 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
55 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
56 EVENT_CONSTRAINT_END
57};
58
59static struct event_constraint intel_westmere_event_constraints[] =
60{
61 FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */
62 FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */
63 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
64 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
65 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
66 EVENT_CONSTRAINT_END
67};
68
69static struct event_constraint intel_gen_event_constraints[] =
70{
71 FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */
72 FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */
73 EVENT_CONSTRAINT_END
74};
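
In these constraint tables the second argument is a bitmask of counter indices the event may use: 0x2 restricts an event to generic counter 1, 0x3 allows counters 0-1, and the FIXED_EVENT_CONSTRAINT entries additionally set bit 32 and up for the fixed counters (X86_PMC_IDX_FIXED in this code). A standalone sketch decoding one such mask, not part of the patch:

#include <stdio.h>

int main(void)
{
	/* mask from FIXED_EVENT_CONSTRAINT(0xc0, 0x3 | (1ULL << 32)) above */
	unsigned long long idxmsk = 0x3ULL | (1ULL << 32);
	int bit;

	for (bit = 0; bit < 64; bit++)
		if (idxmsk & (1ULL << bit))
			printf("allowed counter idx %d\n", bit);	/* 0, 1, 32 */
	return 0;
}
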
75
76static u64 intel_pmu_event_map(int hw_event)
77{
78 return intel_perfmon_event_map[hw_event];
79}
80
81static __initconst u64 westmere_hw_cache_event_ids
82 [PERF_COUNT_HW_CACHE_MAX]
83 [PERF_COUNT_HW_CACHE_OP_MAX]
84 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
85{
86 [ C(L1D) ] = {
87 [ C(OP_READ) ] = {
88 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
89 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
90 },
91 [ C(OP_WRITE) ] = {
  92 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
93 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
94 },
95 [ C(OP_PREFETCH) ] = {
96 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
97 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
98 },
99 },
100 [ C(L1I ) ] = {
101 [ C(OP_READ) ] = {
102 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
103 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
104 },
105 [ C(OP_WRITE) ] = {
106 [ C(RESULT_ACCESS) ] = -1,
107 [ C(RESULT_MISS) ] = -1,
108 },
109 [ C(OP_PREFETCH) ] = {
110 [ C(RESULT_ACCESS) ] = 0x0,
111 [ C(RESULT_MISS) ] = 0x0,
112 },
113 },
114 [ C(LL ) ] = {
115 [ C(OP_READ) ] = {
116 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
117 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
118 },
119 [ C(OP_WRITE) ] = {
120 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
121 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
122 },
123 [ C(OP_PREFETCH) ] = {
124 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
125 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
126 },
127 },
128 [ C(DTLB) ] = {
129 [ C(OP_READ) ] = {
130 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
131 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
132 },
133 [ C(OP_WRITE) ] = {
 134 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
135 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
136 },
137 [ C(OP_PREFETCH) ] = {
138 [ C(RESULT_ACCESS) ] = 0x0,
139 [ C(RESULT_MISS) ] = 0x0,
140 },
141 },
142 [ C(ITLB) ] = {
143 [ C(OP_READ) ] = {
144 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
145 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
146 },
147 [ C(OP_WRITE) ] = {
148 [ C(RESULT_ACCESS) ] = -1,
149 [ C(RESULT_MISS) ] = -1,
150 },
151 [ C(OP_PREFETCH) ] = {
152 [ C(RESULT_ACCESS) ] = -1,
153 [ C(RESULT_MISS) ] = -1,
154 },
155 },
156 [ C(BPU ) ] = {
157 [ C(OP_READ) ] = {
158 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
159 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
160 },
161 [ C(OP_WRITE) ] = {
162 [ C(RESULT_ACCESS) ] = -1,
163 [ C(RESULT_MISS) ] = -1,
164 },
165 [ C(OP_PREFETCH) ] = {
166 [ C(RESULT_ACCESS) ] = -1,
167 [ C(RESULT_MISS) ] = -1,
168 },
169 },
170};
171
172static __initconst u64 nehalem_hw_cache_event_ids
173 [PERF_COUNT_HW_CACHE_MAX]
174 [PERF_COUNT_HW_CACHE_OP_MAX]
175 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
176{
177 [ C(L1D) ] = {
178 [ C(OP_READ) ] = {
179 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
180 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
181 },
182 [ C(OP_WRITE) ] = {
183 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
184 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
185 },
186 [ C(OP_PREFETCH) ] = {
187 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
188 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
189 },
190 },
191 [ C(L1I ) ] = {
192 [ C(OP_READ) ] = {
193 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
194 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
195 },
196 [ C(OP_WRITE) ] = {
197 [ C(RESULT_ACCESS) ] = -1,
198 [ C(RESULT_MISS) ] = -1,
199 },
200 [ C(OP_PREFETCH) ] = {
201 [ C(RESULT_ACCESS) ] = 0x0,
202 [ C(RESULT_MISS) ] = 0x0,
203 },
204 },
205 [ C(LL ) ] = {
206 [ C(OP_READ) ] = {
207 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
208 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
209 },
210 [ C(OP_WRITE) ] = {
211 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
212 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
213 },
214 [ C(OP_PREFETCH) ] = {
215 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
216 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
217 },
218 },
219 [ C(DTLB) ] = {
220 [ C(OP_READ) ] = {
221 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
222 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
223 },
224 [ C(OP_WRITE) ] = {
225 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
226 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
227 },
228 [ C(OP_PREFETCH) ] = {
229 [ C(RESULT_ACCESS) ] = 0x0,
230 [ C(RESULT_MISS) ] = 0x0,
231 },
232 },
233 [ C(ITLB) ] = {
234 [ C(OP_READ) ] = {
235 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
236 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
237 },
238 [ C(OP_WRITE) ] = {
239 [ C(RESULT_ACCESS) ] = -1,
240 [ C(RESULT_MISS) ] = -1,
241 },
242 [ C(OP_PREFETCH) ] = {
243 [ C(RESULT_ACCESS) ] = -1,
244 [ C(RESULT_MISS) ] = -1,
245 },
246 },
247 [ C(BPU ) ] = {
248 [ C(OP_READ) ] = {
249 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
250 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
251 },
252 [ C(OP_WRITE) ] = {
253 [ C(RESULT_ACCESS) ] = -1,
254 [ C(RESULT_MISS) ] = -1,
255 },
256 [ C(OP_PREFETCH) ] = {
257 [ C(RESULT_ACCESS) ] = -1,
258 [ C(RESULT_MISS) ] = -1,
259 },
260 },
261};
262
263static __initconst u64 core2_hw_cache_event_ids
264 [PERF_COUNT_HW_CACHE_MAX]
265 [PERF_COUNT_HW_CACHE_OP_MAX]
266 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
267{
268 [ C(L1D) ] = {
269 [ C(OP_READ) ] = {
270 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
271 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
272 },
273 [ C(OP_WRITE) ] = {
274 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
275 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
276 },
277 [ C(OP_PREFETCH) ] = {
278 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
279 [ C(RESULT_MISS) ] = 0,
280 },
281 },
282 [ C(L1I ) ] = {
283 [ C(OP_READ) ] = {
284 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
285 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
286 },
287 [ C(OP_WRITE) ] = {
288 [ C(RESULT_ACCESS) ] = -1,
289 [ C(RESULT_MISS) ] = -1,
290 },
291 [ C(OP_PREFETCH) ] = {
292 [ C(RESULT_ACCESS) ] = 0,
293 [ C(RESULT_MISS) ] = 0,
294 },
295 },
296 [ C(LL ) ] = {
297 [ C(OP_READ) ] = {
298 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
299 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
300 },
301 [ C(OP_WRITE) ] = {
302 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
303 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
304 },
305 [ C(OP_PREFETCH) ] = {
306 [ C(RESULT_ACCESS) ] = 0,
307 [ C(RESULT_MISS) ] = 0,
308 },
309 },
310 [ C(DTLB) ] = {
311 [ C(OP_READ) ] = {
312 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
313 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
314 },
315 [ C(OP_WRITE) ] = {
316 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
317 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
318 },
319 [ C(OP_PREFETCH) ] = {
320 [ C(RESULT_ACCESS) ] = 0,
321 [ C(RESULT_MISS) ] = 0,
322 },
323 },
324 [ C(ITLB) ] = {
325 [ C(OP_READ) ] = {
326 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
327 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
328 },
329 [ C(OP_WRITE) ] = {
330 [ C(RESULT_ACCESS) ] = -1,
331 [ C(RESULT_MISS) ] = -1,
332 },
333 [ C(OP_PREFETCH) ] = {
334 [ C(RESULT_ACCESS) ] = -1,
335 [ C(RESULT_MISS) ] = -1,
336 },
337 },
338 [ C(BPU ) ] = {
339 [ C(OP_READ) ] = {
340 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
341 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
342 },
343 [ C(OP_WRITE) ] = {
344 [ C(RESULT_ACCESS) ] = -1,
345 [ C(RESULT_MISS) ] = -1,
346 },
347 [ C(OP_PREFETCH) ] = {
348 [ C(RESULT_ACCESS) ] = -1,
349 [ C(RESULT_MISS) ] = -1,
350 },
351 },
352};
353
354static __initconst u64 atom_hw_cache_event_ids
355 [PERF_COUNT_HW_CACHE_MAX]
356 [PERF_COUNT_HW_CACHE_OP_MAX]
357 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
358{
359 [ C(L1D) ] = {
360 [ C(OP_READ) ] = {
361 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
362 [ C(RESULT_MISS) ] = 0,
363 },
364 [ C(OP_WRITE) ] = {
365 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
366 [ C(RESULT_MISS) ] = 0,
367 },
368 [ C(OP_PREFETCH) ] = {
369 [ C(RESULT_ACCESS) ] = 0x0,
370 [ C(RESULT_MISS) ] = 0,
371 },
372 },
373 [ C(L1I ) ] = {
374 [ C(OP_READ) ] = {
375 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
376 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
377 },
378 [ C(OP_WRITE) ] = {
379 [ C(RESULT_ACCESS) ] = -1,
380 [ C(RESULT_MISS) ] = -1,
381 },
382 [ C(OP_PREFETCH) ] = {
383 [ C(RESULT_ACCESS) ] = 0,
384 [ C(RESULT_MISS) ] = 0,
385 },
386 },
387 [ C(LL ) ] = {
388 [ C(OP_READ) ] = {
389 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
390 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
391 },
392 [ C(OP_WRITE) ] = {
393 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
394 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
395 },
396 [ C(OP_PREFETCH) ] = {
397 [ C(RESULT_ACCESS) ] = 0,
398 [ C(RESULT_MISS) ] = 0,
399 },
400 },
401 [ C(DTLB) ] = {
402 [ C(OP_READ) ] = {
403 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
404 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
405 },
406 [ C(OP_WRITE) ] = {
407 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
408 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
409 },
410 [ C(OP_PREFETCH) ] = {
411 [ C(RESULT_ACCESS) ] = 0,
412 [ C(RESULT_MISS) ] = 0,
413 },
414 },
415 [ C(ITLB) ] = {
416 [ C(OP_READ) ] = {
417 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
418 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
419 },
420 [ C(OP_WRITE) ] = {
421 [ C(RESULT_ACCESS) ] = -1,
422 [ C(RESULT_MISS) ] = -1,
423 },
424 [ C(OP_PREFETCH) ] = {
425 [ C(RESULT_ACCESS) ] = -1,
426 [ C(RESULT_MISS) ] = -1,
427 },
428 },
429 [ C(BPU ) ] = {
430 [ C(OP_READ) ] = {
431 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
432 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
433 },
434 [ C(OP_WRITE) ] = {
435 [ C(RESULT_ACCESS) ] = -1,
436 [ C(RESULT_MISS) ] = -1,
437 },
438 [ C(OP_PREFETCH) ] = {
439 [ C(RESULT_ACCESS) ] = -1,
440 [ C(RESULT_MISS) ] = -1,
441 },
442 },
443};
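
The three tables above all share one shape: a [cache-level][operation][result] cube of raw event codes, where a 0 entry is treated as "no event available on this CPU" and -1 marks a combination the hardware cannot count at all. As a rough, stand-alone sketch of how such a table is consumed (hypothetical index names, and only a subset of the entries):

#include <stdio.h>

/*
 * Illustrative sketch only: hypothetical indices and a small subset of the
 * table. A 0 entry means "no event available", -1 means the combination
 * cannot be counted at all.
 */
enum { L1D, L1I, CACHE_MAX };
enum { OP_READ, OP_WRITE, OP_PREFETCH, OP_MAX };
enum { RESULT_ACCESS, RESULT_MISS, RESULT_MAX };

static const long long cache_event_ids[CACHE_MAX][OP_MAX][RESULT_MAX] = {
	[L1D][OP_READ]  = { 0x2140, 0 },	/* Atom-style L1D_CACHE.LD, no miss event */
	[L1D][OP_WRITE] = { 0x2240, 0 },
	[L1I][OP_READ]  = { 0x0380, 0x0280 },
	[L1I][OP_WRITE] = { -1, -1 },		/* writes to L1I cannot be counted */
};

int main(void)
{
	printf("L1I read miss    -> raw %#llx\n",
	       (unsigned long long)cache_event_ids[L1I][OP_READ][RESULT_MISS]);
	printf("L1I write access -> %lld (invalid combination)\n",
	       cache_event_ids[L1I][OP_WRITE][RESULT_ACCESS]);
	return 0;
}
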
444
445static u64 intel_pmu_raw_event(u64 hw_event)
446{
447#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
448#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
449#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
450#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
451#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
452
453#define CORE_EVNTSEL_MASK \
454 (INTEL_ARCH_EVTSEL_MASK | \
455 INTEL_ARCH_UNIT_MASK | \
456 INTEL_ARCH_EDGE_MASK | \
457 INTEL_ARCH_INV_MASK | \
458 INTEL_ARCH_CNT_MASK)
459
460 return hw_event & CORE_EVNTSEL_MASK;
461}
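
intel_pmu_raw_event() simply masks a user-supplied raw config down to the architecturally defined EVTSEL fields. Going by the masks defined just above (event select in bits 7:0, unit mask in 15:8, edge in bit 18, invert in bit 23, counter mask in 31:24), a small user-space helper that builds and decodes such a raw value could look like this sketch; names and the sample event are made up for illustration:

#include <stdint.h>
#include <stdio.h>

#define EVTSEL_EVENT_MASK  0x000000FFULL
#define EVTSEL_UMASK_MASK  0x0000FF00ULL

/* Compose a raw config from an event-select and unit-mask pair. */
static uint64_t make_raw(uint8_t event, uint8_t umask)
{
	return (uint64_t)event | ((uint64_t)umask << 8);
}

int main(void)
{
	/* e.g. event 0xc4 with unit mask 0x01 -> raw 0x01c4 */
	uint64_t raw = make_raw(0xc4, 0x01);

	printf("raw=%#llx event=%#llx umask=%#llx\n",
	       (unsigned long long)raw,
	       (unsigned long long)(raw & EVTSEL_EVENT_MASK),
	       (unsigned long long)((raw & EVTSEL_UMASK_MASK) >> 8));
	return 0;
}
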
462
463static void intel_pmu_enable_bts(u64 config)
464{
465 unsigned long debugctlmsr;
466
467 debugctlmsr = get_debugctlmsr();
468
469 debugctlmsr |= X86_DEBUGCTL_TR;
470 debugctlmsr |= X86_DEBUGCTL_BTS;
471 debugctlmsr |= X86_DEBUGCTL_BTINT;
472
473 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
474 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
475
476 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
477 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
478
479 update_debugctlmsr(debugctlmsr);
480}
481
482static void intel_pmu_disable_bts(void)
483{
484 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
485 unsigned long debugctlmsr;
486
487 if (!cpuc->ds)
488 return;
489
490 debugctlmsr = get_debugctlmsr();
491
492 debugctlmsr &=
493 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
494 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
495
496 update_debugctlmsr(debugctlmsr);
497}
498
499static void intel_pmu_disable_all(void)
500{
501 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
502
503 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
504
505 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
506 intel_pmu_disable_bts();
507}
508
509static void intel_pmu_enable_all(void)
510{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
512
513 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
514
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
516 struct perf_event *event =
517 cpuc->events[X86_PMC_IDX_FIXED_BTS];
518
519 if (WARN_ON_ONCE(!event))
520 return;
521
522 intel_pmu_enable_bts(event->hw.config);
523 }
524}
525
526static inline u64 intel_pmu_get_status(void)
527{
528 u64 status;
529
530 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
531
532 return status;
533}
534
535static inline void intel_pmu_ack_status(u64 ack)
536{
537 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
538}
539
540static inline void
541intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
542{
543 int idx = __idx - X86_PMC_IDX_FIXED;
544 u64 ctrl_val, mask;
545
546 mask = 0xfULL << (idx * 4);
547
548 rdmsrl(hwc->config_base, ctrl_val);
549 ctrl_val &= ~mask;
550 (void)checking_wrmsrl(hwc->config_base, ctrl_val);
551}
552
553static void intel_pmu_drain_bts_buffer(void)
554{
555 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
556 struct debug_store *ds = cpuc->ds;
557 struct bts_record {
558 u64 from;
559 u64 to;
560 u64 flags;
561 };
562 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
563 struct bts_record *at, *top;
564 struct perf_output_handle handle;
565 struct perf_event_header header;
566 struct perf_sample_data data;
567 struct pt_regs regs;
568
569 if (!event)
570 return;
571
572 if (!ds)
573 return;
574
575 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
576 top = (struct bts_record *)(unsigned long)ds->bts_index;
577
578 if (top <= at)
579 return;
580
581 ds->bts_index = ds->bts_buffer_base;
582
583
584 data.period = event->hw.last_period;
585 data.addr = 0;
586 data.raw = NULL;
587 regs.ip = 0;
588
589 /*
590 * Prepare a generic sample, i.e. fill in the invariant fields.
591 * We will overwrite the from and to address before we output
592 * the sample.
593 */
594 perf_prepare_sample(&header, &data, event, &regs);
595
596 if (perf_output_begin(&handle, event,
597 header.size * (top - at), 1, 1))
598 return;
599
600 for (; at < top; at++) {
601 data.ip = at->from;
602 data.addr = at->to;
603
604 perf_output_sample(&handle, &header, &data, event);
605 }
606
607 perf_output_end(&handle);
608
609 /* There's new data available. */
610 event->hw.interrupts++;
611 event->pending_kill = POLL_IN;
612}
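
The drain routine above treats the BTS area as a flat array of 24-byte records (from, to, flags) lying between bts_buffer_base and the hardware-maintained bts_index, emits one sample per record, and then rewinds the index. A stand-alone sketch of that walk, with a fabricated buffer instead of a real debug store:

#include <stdint.h>
#include <stdio.h>

struct bts_record {
	uint64_t from;
	uint64_t to;
	uint64_t flags;
};

/* Walk every record between 'base' and 'index', as the drain routine does. */
static void drain(const struct bts_record *base, const struct bts_record *index)
{
	const struct bts_record *at;

	for (at = base; at < index; at++)
		printf("branch %#llx -> %#llx\n",
		       (unsigned long long)at->from,
		       (unsigned long long)at->to);
}

int main(void)
{
	struct bts_record buf[3] = {
		{ 0x1000, 0x2000, 0 },
		{ 0x2004, 0x3000, 0 },
	};

	/* Pretend the hardware has written two records so far. */
	drain(buf, buf + 2);
	return 0;
}
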
613
614static inline void
615intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
616{
617 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
618 intel_pmu_disable_bts();
619 intel_pmu_drain_bts_buffer();
620 return;
621 }
622
623 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
624 intel_pmu_disable_fixed(hwc, idx);
625 return;
626 }
627
628 x86_pmu_disable_event(hwc, idx);
629}
630
631static inline void
632intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
633{
634 int idx = __idx - X86_PMC_IDX_FIXED;
635 u64 ctrl_val, bits, mask;
636 int err;
637
638 /*
639 * Enable IRQ generation (0x8),
640 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
641 * if requested:
642 */
643 bits = 0x8ULL;
644 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
645 bits |= 0x2;
646 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
647 bits |= 0x1;
648
649 /*
650 * ANY bit is supported in v3 and up
651 */
652 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
653 bits |= 0x4;
654
655 bits <<= (idx * 4);
656 mask = 0xfULL << (idx * 4);
657
658 rdmsrl(hwc->config_base, ctrl_val);
659 ctrl_val &= ~mask;
660 ctrl_val |= bits;
661 err = checking_wrmsrl(hwc->config_base, ctrl_val);
662}
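
Each fixed counter owns a 4-bit field in the fixed-counter control MSR: bit 0 enables ring-0 counting, bit 1 ring-3 counting, bit 2 the any-thread qualifier (v3 and up), and bit 3 PMI generation, and the field for counter idx sits at bits [idx*4 .. idx*4+3]. The read-modify-write above reduces to this small, purely arithmetic sketch (helper name invented for illustration):

#include <stdint.h>
#include <stdio.h>

/* Compute the new fixed-counter control value for counter 'idx'. */
static uint64_t set_fixed_ctrl(uint64_t ctrl, int idx,
			       int usr, int os, int any, int pmi)
{
	uint64_t bits = 0, mask = 0xfULL << (idx * 4);

	if (os)
		bits |= 0x1;	/* ring-0 counting */
	if (usr)
		bits |= 0x2;	/* ring-3 counting */
	if (any)
		bits |= 0x4;	/* any-thread (v3+) */
	if (pmi)
		bits |= 0x8;	/* interrupt on overflow */

	return (ctrl & ~mask) | (bits << (idx * 4));
}

int main(void)
{
	/* Enable counter 1 for user+kernel with PMI: field value 0xb at bits 7:4. */
	printf("ctrl=%#llx\n",
	       (unsigned long long)set_fixed_ctrl(0, 1, 1, 1, 0, 1));
	return 0;
}
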
663
664static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
665{
666 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
667 if (!__get_cpu_var(cpu_hw_events).enabled)
668 return;
669
670 intel_pmu_enable_bts(hwc->config);
671 return;
672 }
673
674 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
675 intel_pmu_enable_fixed(hwc, idx);
676 return;
677 }
678
679 __x86_pmu_enable_event(hwc, idx);
680}
681
682/*
683 * Save and restart an expired event. Called by NMI contexts,
684 * so it has to be careful about preempting normal event ops:
685 */
686static int intel_pmu_save_and_restart(struct perf_event *event)
687{
688 struct hw_perf_event *hwc = &event->hw;
689 int idx = hwc->idx;
690 int ret;
691
692 x86_perf_event_update(event, hwc, idx);
693 ret = x86_perf_event_set_period(event, hwc, idx);
694
695 return ret;
696}
697
698static void intel_pmu_reset(void)
699{
700 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
701 unsigned long flags;
702 int idx;
703
704 if (!x86_pmu.num_events)
705 return;
706
707 local_irq_save(flags);
708
709 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
710
711 for (idx = 0; idx < x86_pmu.num_events; idx++) {
712 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
713 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
714 }
715 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
716 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
717 }
718 if (ds)
719 ds->bts_index = ds->bts_buffer_base;
720
721 local_irq_restore(flags);
722}
723
724/*
725 * This handler is triggered by the local APIC, so the APIC IRQ handling
726 * rules apply:
727 */
728static int intel_pmu_handle_irq(struct pt_regs *regs)
729{
730 struct perf_sample_data data;
731 struct cpu_hw_events *cpuc;
732 int bit, loops;
733 u64 ack, status;
734
735 data.addr = 0;
736 data.raw = NULL;
737
738 cpuc = &__get_cpu_var(cpu_hw_events);
739
740 perf_disable();
741 intel_pmu_drain_bts_buffer();
742 status = intel_pmu_get_status();
743 if (!status) {
744 perf_enable();
745 return 0;
746 }
747
748 loops = 0;
749again:
750 if (++loops > 100) {
751 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
752 perf_event_print_debug();
753 intel_pmu_reset();
754 perf_enable();
755 return 1;
756 }
757
758 inc_irq_stat(apic_perf_irqs);
759 ack = status;
760 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
761 struct perf_event *event = cpuc->events[bit];
762
763 clear_bit(bit, (unsigned long *) &status);
764 if (!test_bit(bit, cpuc->active_mask))
765 continue;
766
767 if (!intel_pmu_save_and_restart(event))
768 continue;
769
770 data.period = event->hw.last_period;
771
772 if (perf_event_overflow(event, 1, &data, regs))
773 intel_pmu_disable_event(&event->hw, bit);
774 }
775
776 intel_pmu_ack_status(ack);
777
778 /*
779 * Repeat if there is more work to be done:
780 */
781 status = intel_pmu_get_status();
782 if (status)
783 goto again;
784
785 perf_enable();
786
787 return 1;
788}
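
The interrupt handler reads GLOBAL_STATUS, handles every set overflow bit, writes the same mask back to GLOBAL_OVF_CTRL to acknowledge, and repeats until the status register reads zero. Stripped of the MSR accesses, the bit-walking part of that loop reduces to a sketch like this:

#include <stdint.h>
#include <stdio.h>

/* Process each set bit in an overflow status word, lowest bit first. */
static void handle_overflows(uint64_t status)
{
	while (status) {
		int bit = __builtin_ctzll(status);	/* index of lowest set bit */

		printf("counter %d overflowed\n", bit);
		status &= status - 1;			/* clear that bit */
	}
}

int main(void)
{
	/* Counters 0 and 33 (a fixed-counter slot) pending. */
	handle_overflows((1ULL << 0) | (1ULL << 33));
	return 0;
}
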
789
790static struct event_constraint bts_constraint =
791 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
792
793static struct event_constraint *
794intel_special_constraints(struct perf_event *event)
795{
796 unsigned int hw_event;
797
798 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
799
800 if (unlikely((hw_event ==
801 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
802 (event->hw.sample_period == 1))) {
803
804 return &bts_constraint;
805 }
806 return NULL;
807}
808
809static struct event_constraint *
810intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
811{
812 struct event_constraint *c;
813
814 c = intel_special_constraints(event);
815 if (c)
816 return c;
817
818 return x86_get_event_constraints(cpuc, event);
819}
820
821static __initconst struct x86_pmu core_pmu = {
822 .name = "core",
823 .handle_irq = x86_pmu_handle_irq,
824 .disable_all = x86_pmu_disable_all,
825 .enable_all = x86_pmu_enable_all,
826 .enable = x86_pmu_enable_event,
827 .disable = x86_pmu_disable_event,
828 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
829 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
830 .event_map = intel_pmu_event_map,
831 .raw_event = intel_pmu_raw_event,
832 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
833 .apic = 1,
834 /*
835 * Intel PMCs cannot be accessed sanely above 32 bit width,
836 * so we install an artificial 1<<31 period regardless of
837 * the generic event period:
838 */
839 .max_period = (1ULL << 31) - 1,
840 .get_event_constraints = intel_get_event_constraints,
841 .event_constraints = intel_core_event_constraints,
842};
843
844static __initconst struct x86_pmu intel_pmu = {
845 .name = "Intel",
846 .handle_irq = intel_pmu_handle_irq,
847 .disable_all = intel_pmu_disable_all,
848 .enable_all = intel_pmu_enable_all,
849 .enable = intel_pmu_enable_event,
850 .disable = intel_pmu_disable_event,
851 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
852 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
853 .event_map = intel_pmu_event_map,
854 .raw_event = intel_pmu_raw_event,
855 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
856 .apic = 1,
857 /*
858 * Intel PMCs cannot be accessed sanely above 32 bit width,
859 * so we install an artificial 1<<31 period regardless of
860 * the generic event period:
861 */
862 .max_period = (1ULL << 31) - 1,
863 .enable_bts = intel_pmu_enable_bts,
864 .disable_bts = intel_pmu_disable_bts,
865 .get_event_constraints = intel_get_event_constraints
866};
867
868static __init int intel_pmu_init(void)
869{
870 union cpuid10_edx edx;
871 union cpuid10_eax eax;
872 unsigned int unused;
873 unsigned int ebx;
874 int version;
875
876 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
877 /* check for P6 processor family */
878 if (boot_cpu_data.x86 == 6) {
879 return p6_pmu_init();
880 } else {
881 return -ENODEV;
882 }
883 }
884
885 /*
886 * Check whether the Architectural PerfMon supports
887 * Branch Misses Retired hw_event or not.
888 */
889 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
890 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
891 return -ENODEV;
892
893 version = eax.split.version_id;
894 if (version < 2)
895 x86_pmu = core_pmu;
896 else
897 x86_pmu = intel_pmu;
898
899 x86_pmu.version = version;
900 x86_pmu.num_events = eax.split.num_events;
901 x86_pmu.event_bits = eax.split.bit_width;
902 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
903
904 /*
905 * Quirk: v2 perfmon does not report fixed-purpose events, so
906 * assume at least 3 events:
907 */
908 if (version > 1)
909 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
910
911 /*
912 * Install the hw-cache-events table:
913 */
914 switch (boot_cpu_data.x86_model) {
915 case 14: /* 65 nm core solo/duo, "Yonah" */
916 pr_cont("Core events, ");
917 break;
918
919 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
920 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
921 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
922 case 29: /* six-core 45 nm xeon "Dunnington" */
923 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
924 sizeof(hw_cache_event_ids));
925
926 x86_pmu.event_constraints = intel_core2_event_constraints;
927 pr_cont("Core2 events, ");
928 break;
929
930 case 26: /* 45 nm nehalem, "Bloomfield" */
931 case 30: /* 45 nm nehalem, "Lynnfield" */
932 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
933 sizeof(hw_cache_event_ids));
934
935 x86_pmu.event_constraints = intel_nehalem_event_constraints;
936 pr_cont("Nehalem/Corei7 events, ");
937 break;
938 case 28:
939 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
940 sizeof(hw_cache_event_ids));
941
942 x86_pmu.event_constraints = intel_gen_event_constraints;
943 pr_cont("Atom events, ");
944 break;
945
946 case 37: /* 32 nm nehalem, "Clarkdale" */
947 case 44: /* 32 nm nehalem, "Gulftown" */
948 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
949 sizeof(hw_cache_event_ids));
950
951 x86_pmu.event_constraints = intel_westmere_event_constraints;
952 pr_cont("Westmere events, ");
953 break;
954 default:
955 /*
956 * default constraints for v2 and up
957 */
958 x86_pmu.event_constraints = intel_gen_event_constraints;
959 pr_cont("generic architected perfmon, ");
960 }
961 return 0;
962}
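
The version, counter count and counter width consumed above come straight out of CPUID leaf 0xA: EAX carries the version ID (bits 7:0), the number of general-purpose counters (15:8), their bit width (23:16) and the length of the EBX event-availability vector (31:24), while EDX reports the fixed counters (4:0) and their width (12:5). A hedged user-space sketch of that decode, assuming the GCC/Clang cpuid.h builtin:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0xA not available");
		return 1;
	}

	printf("version:        %u\n", eax & 0xff);
	printf("gp counters:    %u\n", (eax >> 8) & 0xff);
	printf("counter width:  %u bits\n", (eax >> 16) & 0xff);
	printf("ebx vec length: %u\n", (eax >> 24) & 0xff);
	printf("fixed counters: %u\n", edx & 0x1f);
	printf("fixed width:    %u bits\n", (edx >> 5) & 0xff);
	return 0;
}
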
963
964#else /* CONFIG_CPU_SUP_INTEL */
965
966static int intel_pmu_init(void)
967{
968 return 0;
969}
970
971#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
new file mode 100644
index 000000000000..1ca5ba078afd
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -0,0 +1,157 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Not sure about some of these
5 */
6static const u64 p6_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
15};
16
17static u64 p6_pmu_event_map(int hw_event)
18{
19 return p6_perfmon_event_map[hw_event];
20}
21
22/*
23 * Event setting that is specified not to count anything.
24 * We use this to effectively disable a counter.
25 *
26 * L2_RQSTS with 0 MESI unit mask.
27 */
28#define P6_NOP_EVENT 0x0000002EULL
29
30static u64 p6_pmu_raw_event(u64 hw_event)
31{
32#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
33#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
34#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
35#define P6_EVNTSEL_INV_MASK 0x00800000ULL
36#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
37
38#define P6_EVNTSEL_MASK \
39 (P6_EVNTSEL_EVENT_MASK | \
40 P6_EVNTSEL_UNIT_MASK | \
41 P6_EVNTSEL_EDGE_MASK | \
42 P6_EVNTSEL_INV_MASK | \
43 P6_EVNTSEL_REG_MASK)
44
45 return hw_event & P6_EVNTSEL_MASK;
46}
47
48static struct event_constraint p6_event_constraints[] =
49{
50 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
51 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
52 INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
53 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
54 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
55 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
56 EVENT_CONSTRAINT_END
57};
58
59static void p6_pmu_disable_all(void)
60{
61 u64 val;
62
63 /* p6 only has one enable register */
64 rdmsrl(MSR_P6_EVNTSEL0, val);
65 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
66 wrmsrl(MSR_P6_EVNTSEL0, val);
67}
68
69static void p6_pmu_enable_all(void)
70{
71 unsigned long val;
72
73 /* p6 only has one enable register */
74 rdmsrl(MSR_P6_EVNTSEL0, val);
75 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
76 wrmsrl(MSR_P6_EVNTSEL0, val);
77}
78
79static inline void
80p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
81{
82 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
83 u64 val = P6_NOP_EVENT;
84
85 if (cpuc->enabled)
86 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
87
88 (void)checking_wrmsrl(hwc->config_base + idx, val);
89}
90
91static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
92{
93 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
94 u64 val;
95
96 val = hwc->config;
97 if (cpuc->enabled)
98 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
99
100 (void)checking_wrmsrl(hwc->config_base + idx, val);
101}
102
103static __initconst struct x86_pmu p6_pmu = {
104 .name = "p6",
105 .handle_irq = x86_pmu_handle_irq,
106 .disable_all = p6_pmu_disable_all,
107 .enable_all = p6_pmu_enable_all,
108 .enable = p6_pmu_enable_event,
109 .disable = p6_pmu_disable_event,
110 .eventsel = MSR_P6_EVNTSEL0,
111 .perfctr = MSR_P6_PERFCTR0,
112 .event_map = p6_pmu_event_map,
113 .raw_event = p6_pmu_raw_event,
114 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
115 .apic = 1,
116 .max_period = (1ULL << 31) - 1,
117 .version = 0,
118 .num_events = 2,
119 /*
120	 * The counters are 40 bits wide. However, they are designed such
121	 * that bits [32-39] are sign extensions of bit 31. As such the
122	 * effective width of an event on a P6-like PMU is 32 bits only.
123	 *
124	 * See the IA-32 Intel Architecture Software Developer's Manual, Vol 3B.

125 */
126 .event_bits = 32,
127 .event_mask = (1ULL << 32) - 1,
128 .get_event_constraints = x86_get_event_constraints,
129 .event_constraints = p6_event_constraints,
130};
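
Because bits 32-39 of a P6 counter merely sign-extend bit 31, the driver treats the counter as 32 bits wide (event_bits = 32, event_mask = 2^32 - 1), so deltas between successive raw reads only make sense modulo 2^32 and wrap-around is absorbed naturally. A quick sketch of that delta arithmetic:

#include <stdint.h>
#include <stdio.h>

#define EVENT_MASK ((1ULL << 32) - 1)

/* Delta between two raw counter reads, ignoring the sign-extension bits. */
static uint64_t counter_delta(uint64_t prev, uint64_t now)
{
	return (now - prev) & EVENT_MASK;
}

int main(void)
{
	/* A wrap from near the top of the 32-bit range still yields a small delta. */
	uint64_t prev = 0xfffffff0ULL;
	uint64_t now  = 0x00000010ULL;

	printf("delta = %llu\n", (unsigned long long)counter_delta(prev, now));
	return 0;
}
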
131
132static __init int p6_pmu_init(void)
133{
134 switch (boot_cpu_data.x86_model) {
135 case 1:
136 case 3: /* Pentium Pro */
137 case 5:
138 case 6: /* Pentium II */
139 case 7:
140 case 8:
141 case 11: /* Pentium III */
142 case 9:
143 case 13:
144 /* Pentium M */
145 break;
146 default:
147 pr_cont("unsupported p6 CPU model %d ",
148 boot_cpu_data.x86_model);
149 return -ENODEV;
150 }
151
152 x86_pmu = p6_pmu;
153
154 return 0;
155}
156
157#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 898df9719afb..74f4e85a5727 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
115 115
116 return !test_bit(counter, perfctr_nmi_owner); 116 return !test_bit(counter, perfctr_nmi_owner);
117} 117}
118
119/* checks the an msr for availability */
120int avail_to_resrv_perfctr_nmi(unsigned int msr)
121{
122 unsigned int counter;
123
124 counter = nmi_perfctr_msr_to_bit(msr);
125 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127 return !test_bit(counter, perfctr_nmi_owner);
128}
129EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 118EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
130 119
131int reserve_perfctr_nmi(unsigned int msr) 120int reserve_perfctr_nmi(unsigned int msr)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index ae775ca47b25..11540a189d93 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -18,11 +18,6 @@
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21/* Just a stub for now */
22int x86_is_stack_id(int id, char *name)
23{
24 return 0;
25}
26 21
27void dump_trace(struct task_struct *task, struct pt_regs *regs, 22void dump_trace(struct task_struct *task, struct pt_regs *regs,
28 unsigned long *stack, unsigned long bp, 23 unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 0ad9597073f5..dce99abb4496 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -33,11 +33,6 @@ static char x86_stack_ids[][8] = {
33#endif 33#endif
34}; 34};
35 35
36int x86_is_stack_id(int id, char *name)
37{
38 return x86_stack_ids[id - 1] == name;
39}
40
41static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 36static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
42 unsigned *usedp, char **idp) 37 unsigned *usedp, char **idp)
43{ 38{
@@ -291,6 +286,7 @@ void show_registers(struct pt_regs *regs)
291 286
292 sp = regs->sp; 287 sp = regs->sp;
293 printk("CPU %d ", cpu); 288 printk("CPU %d ", cpu);
289 print_modules();
294 __show_regs(regs, 1); 290 __show_regs(regs, 1);
295 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 291 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
296 cur->comm, cur->pid, task_thread_info(cur), cur); 292 cur->comm, cur->pid, task_thread_info(cur), cur);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a1a7876cadcb..740b440fbd73 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,21 +12,13 @@
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/pfn.h> 15#include <linux/pfn.h>
21#include <linux/suspend.h> 16#include <linux/suspend.h>
22#include <linux/firmware-map.h> 17#include <linux/firmware-map.h>
23 18
24#include <asm/pgtable.h>
25#include <asm/page.h>
26#include <asm/e820.h> 19#include <asm/e820.h>
27#include <asm/proto.h> 20#include <asm/proto.h>
28#include <asm/setup.h> 21#include <asm/setup.h>
29#include <asm/trampoline.h>
30 22
31/* 23/*
32 * The e820 map is the map that gets modified e.g. with command line parameters 24 * The e820 map is the map that gets modified e.g. with command line parameters
@@ -517,11 +509,19 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
517 int checktype) 509 int checktype)
518{ 510{
519 int i; 511 int i;
512 u64 end;
520 u64 real_removed_size = 0; 513 u64 real_removed_size = 0;
521 514
522 if (size > (ULLONG_MAX - start)) 515 if (size > (ULLONG_MAX - start))
523 size = ULLONG_MAX - start; 516 size = ULLONG_MAX - start;
524 517
518 end = start + size;
519 printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
520 (unsigned long long) start,
521 (unsigned long long) end);
522 e820_print_type(old_type);
523 printk(KERN_CONT "\n");
524
525 for (i = 0; i < e820.nr_map; i++) { 525 for (i = 0; i < e820.nr_map; i++) {
526 struct e820entry *ei = &e820.map[i]; 526 struct e820entry *ei = &e820.map[i];
527 u64 final_start, final_end; 527 u64 final_start, final_end;
@@ -722,319 +722,44 @@ core_initcall(e820_mark_nvs_memory);
722#endif 722#endif
723 723
724/* 724/*
725 * Early reserved memory areas. 725 * Find a free area with specified alignment in a specific range.
726 */
727#define MAX_EARLY_RES 32
728
729struct early_res {
730 u64 start, end;
731 char name[16];
732 char overlap_ok;
733};
734static struct early_res early_res[MAX_EARLY_RES] __initdata = {
735 { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
736#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE)
737 /*
738 * But first pinch a few for the stack/trampoline stuff
739 * FIXME: Don't need the extra page at 4K, but need to fix
740 * trampoline before removing it. (see the GDT stuff)
741 */
742 { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 },
743#endif
744
745 {}
746};
747
748static int __init find_overlapped_early(u64 start, u64 end)
749{
750 int i;
751 struct early_res *r;
752
753 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
754 r = &early_res[i];
755 if (end > r->start && start < r->end)
756 break;
757 }
758
759 return i;
760}
761
762/*
763 * Drop the i-th range from the early reservation map,
764 * by copying any higher ranges down one over it, and
765 * clearing what had been the last slot.
766 */
767static void __init drop_range(int i)
768{
769 int j;
770
771 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
772 ;
773
774 memmove(&early_res[i], &early_res[i + 1],
775 (j - 1 - i) * sizeof(struct early_res));
776
777 early_res[j - 1].end = 0;
778}
779
780/*
781 * Split any existing ranges that:
782 * 1) are marked 'overlap_ok', and
783 * 2) overlap with the stated range [start, end)
784 * into whatever portion (if any) of the existing range is entirely
785 * below or entirely above the stated range. Drop the portion
786 * of the existing range that overlaps with the stated range,
787 * which will allow the caller of this routine to then add that
788 * stated range without conflicting with any existing range.
789 */ 726 */
790static void __init drop_overlaps_that_are_ok(u64 start, u64 end) 727u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
791{ 728{
792 int i; 729 int i;
793 struct early_res *r;
794 u64 lower_start, lower_end;
795 u64 upper_start, upper_end;
796 char name[16];
797 730
798 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { 731 for (i = 0; i < e820.nr_map; i++) {
799 r = &early_res[i]; 732 struct e820entry *ei = &e820.map[i];
733 u64 addr;
734 u64 ei_start, ei_last;
800 735
801 /* Continue past non-overlapping ranges */ 736 if (ei->type != E820_RAM)
802 if (end <= r->start || start >= r->end)
803 continue; 737 continue;
804 738
805 /* 739 ei_last = ei->addr + ei->size;
806 * Leave non-ok overlaps as is; let caller 740 ei_start = ei->addr;
807 * panic "Overlapping early reservations" 741 addr = find_early_area(ei_start, ei_last, start, end,
808 * when it hits this overlap. 742 size, align);
809 */
810 if (!r->overlap_ok)
811 return;
812
813 /*
814 * We have an ok overlap. We will drop it from the early
815 * reservation map, and add back in any non-overlapping
816 * portions (lower or upper) as separate, overlap_ok,
817 * non-overlapping ranges.
818 */
819
820 /* 1. Note any non-overlapping (lower or upper) ranges. */
821 strncpy(name, r->name, sizeof(name) - 1);
822
823 lower_start = lower_end = 0;
824 upper_start = upper_end = 0;
825 if (r->start < start) {
826 lower_start = r->start;
827 lower_end = start;
828 }
829 if (r->end > end) {
830 upper_start = end;
831 upper_end = r->end;
832 }
833
834 /* 2. Drop the original ok overlapping range */
835 drop_range(i);
836
837 i--; /* resume for-loop on copied down entry */
838
839 /* 3. Add back in any non-overlapping ranges. */
840 if (lower_end)
841 reserve_early_overlap_ok(lower_start, lower_end, name);
842 if (upper_end)
843 reserve_early_overlap_ok(upper_start, upper_end, name);
844 }
845}
846
847static void __init __reserve_early(u64 start, u64 end, char *name,
848 int overlap_ok)
849{
850 int i;
851 struct early_res *r;
852
853 i = find_overlapped_early(start, end);
854 if (i >= MAX_EARLY_RES)
855 panic("Too many early reservations");
856 r = &early_res[i];
857 if (r->end)
858 panic("Overlapping early reservations "
859 "%llx-%llx %s to %llx-%llx %s\n",
860 start, end - 1, name?name:"", r->start,
861 r->end - 1, r->name);
862 r->start = start;
863 r->end = end;
864 r->overlap_ok = overlap_ok;
865 if (name)
866 strncpy(r->name, name, sizeof(r->name) - 1);
867}
868
869/*
870 * A few early reservtations come here.
871 *
872 * The 'overlap_ok' in the name of this routine does -not- mean it
873 * is ok for these reservations to overlap an earlier reservation.
874 * Rather it means that it is ok for subsequent reservations to
875 * overlap this one.
876 *
877 * Use this entry point to reserve early ranges when you are doing
878 * so out of "Paranoia", reserving perhaps more memory than you need,
879 * just in case, and don't mind a subsequent overlapping reservation
880 * that is known to be needed.
881 *
882 * The drop_overlaps_that_are_ok() call here isn't really needed.
883 * It would be needed if we had two colliding 'overlap_ok'
884 * reservations, so that the second such would not panic on the
885 * overlap with the first. We don't have any such as of this
886 * writing, but might as well tolerate such if it happens in
887 * the future.
888 */
889void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
890{
891 drop_overlaps_that_are_ok(start, end);
892 __reserve_early(start, end, name, 1);
893}
894
895/*
896 * Most early reservations come here.
897 *
898 * We first have drop_overlaps_that_are_ok() drop any pre-existing
899 * 'overlap_ok' ranges, so that we can then reserve this memory
900 * range without risk of panic'ing on an overlapping overlap_ok
901 * early reservation.
902 */
903void __init reserve_early(u64 start, u64 end, char *name)
904{
905 if (start >= end)
906 return;
907
908 drop_overlaps_that_are_ok(start, end);
909 __reserve_early(start, end, name, 0);
910}
911
912void __init free_early(u64 start, u64 end)
913{
914 struct early_res *r;
915 int i;
916
917 i = find_overlapped_early(start, end);
918 r = &early_res[i];
919 if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
920 panic("free_early on not reserved area: %llx-%llx!",
921 start, end - 1);
922
923 drop_range(i);
924}
925
926void __init early_res_to_bootmem(u64 start, u64 end)
927{
928 int i, count;
929 u64 final_start, final_end;
930
931 count = 0;
932 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
933 count++;
934
935 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
936 count, start, end);
937 for (i = 0; i < count; i++) {
938 struct early_res *r = &early_res[i];
939 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
940 r->start, r->end, r->name);
941 final_start = max(start, r->start);
942 final_end = min(end, r->end);
943 if (final_start >= final_end) {
944 printk(KERN_CONT "\n");
945 continue;
946 }
947 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
948 final_start, final_end);
949 reserve_bootmem_generic(final_start, final_end - final_start,
950 BOOTMEM_DEFAULT);
951 }
952}
953 743
954/* Check for already reserved areas */ 744 if (addr != -1ULL)
955static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) 745 return addr;
956{
957 int i;
958 u64 addr = *addrp;
959 int changed = 0;
960 struct early_res *r;
961again:
962 i = find_overlapped_early(addr, addr + size);
963 r = &early_res[i];
964 if (i < MAX_EARLY_RES && r->end) {
965 *addrp = addr = round_up(r->end, align);
966 changed = 1;
967 goto again;
968 } 746 }
969 return changed; 747 return -1ULL;
970} 748}
971 749
972/* Check for already reserved areas */ 750u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
973static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
974{ 751{
975 int i; 752 return find_e820_area(start, end, size, align);
976 u64 addr = *addrp, last;
977 u64 size = *sizep;
978 int changed = 0;
979again:
980 last = addr + size;
981 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
982 struct early_res *r = &early_res[i];
983 if (last > r->start && addr < r->start) {
984 size = r->start - addr;
985 changed = 1;
986 goto again;
987 }
988 if (last > r->end && addr < r->end) {
989 addr = round_up(r->end, align);
990 size = last - addr;
991 changed = 1;
992 goto again;
993 }
994 if (last <= r->end && addr >= r->start) {
995 (*sizep)++;
996 return 0;
997 }
998 }
999 if (changed) {
1000 *addrp = addr;
1001 *sizep = size;
1002 }
1003 return changed;
1004} 753}
1005 754
1006/* 755u64 __init get_max_mapped(void)
1007 * Find a free area with specified alignment in a specific range.
1008 */
1009u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
1010{ 756{
1011 int i; 757 u64 end = max_pfn_mapped;
1012 758
1013 for (i = 0; i < e820.nr_map; i++) { 759 end <<= PAGE_SHIFT;
1014 struct e820entry *ei = &e820.map[i];
1015 u64 addr, last;
1016 u64 ei_last;
1017 760
1018 if (ei->type != E820_RAM) 761 return end;
1019 continue;
1020 addr = round_up(ei->addr, align);
1021 ei_last = ei->addr + ei->size;
1022 if (addr < start)
1023 addr = round_up(start, align);
1024 if (addr >= ei_last)
1025 continue;
1026 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
1027 ;
1028 last = addr + size;
1029 if (last > ei_last)
1030 continue;
1031 if (last > end)
1032 continue;
1033 return addr;
1034 }
1035 return -1ULL;
1036} 762}
1037
1038/* 763/*
1039 * Find next free range after *start 764 * Find next free range after *start
1040 */ 765 */
@@ -1044,25 +769,19 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
1044 769
1045 for (i = 0; i < e820.nr_map; i++) { 770 for (i = 0; i < e820.nr_map; i++) {
1046 struct e820entry *ei = &e820.map[i]; 771 struct e820entry *ei = &e820.map[i];
1047 u64 addr, last; 772 u64 addr;
1048 u64 ei_last; 773 u64 ei_start, ei_last;
1049 774
1050 if (ei->type != E820_RAM) 775 if (ei->type != E820_RAM)
1051 continue; 776 continue;
1052 addr = round_up(ei->addr, align); 777
1053 ei_last = ei->addr + ei->size; 778 ei_last = ei->addr + ei->size;
1054 if (addr < start) 779 ei_start = ei->addr;
1055 addr = round_up(start, align); 780 addr = find_early_area_size(ei_start, ei_last, start,
1056 if (addr >= ei_last) 781 sizep, align);
1057 continue; 782
1058 *sizep = ei_last - addr; 783 if (addr != -1ULL)
1059 while (bad_addr_size(&addr, sizep, align) && 784 return addr;
1060 addr + *sizep <= ei_last)
1061 ;
1062 last = addr + *sizep;
1063 if (last > ei_last)
1064 continue;
1065 return addr;
1066 } 785 }
1067 786
1068 return -1ULL; 787 return -1ULL;
@@ -1421,6 +1140,8 @@ void __init e820_reserve_resources_late(void)
1421 end = MAX_RESOURCE_SIZE; 1140 end = MAX_RESOURCE_SIZE;
1422 if (start >= end) 1141 if (start >= end)
1423 continue; 1142 continue;
1143 printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
1144 start, end);
1424 reserve_region_with_split(&iomem_resource, start, end, 1145 reserve_region_with_split(&iomem_resource, start, end,
1425 "RAM buffer"); 1146 "RAM buffer");
1426 } 1147 }
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index cdcfb122f256..c2fa9b8b497e 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -362,7 +362,7 @@ void __init efi_init(void)
362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); 362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
363 early_iounmap(tmp, 2); 363 early_iounmap(tmp, 2);
364 364
365 printk(KERN_INFO "EFI v%u.%.02u by %s \n", 365 printk(KERN_INFO "EFI v%u.%.02u by %s\n",
366 efi.systab->hdr.revision >> 16, 366 efi.systab->hdr.revision >> 16,
367 efi.systab->hdr.revision & 0xffff, vendor); 367 efi.systab->hdr.revision & 0xffff, vendor);
368 368
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 309689245431..cd37469b54ee 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -30,14 +30,32 @@
30 30
31#ifdef CONFIG_DYNAMIC_FTRACE 31#ifdef CONFIG_DYNAMIC_FTRACE
32 32
33/*
34 * modifying_code is set to notify NMIs that they need to use
35 * memory barriers when entering or exiting. But we don't want
36 * to burden NMIs with unnecessary memory barriers when code
37 * modification is not being done (which is most of the time).
38 *
39 * A mutex is already held when ftrace_arch_code_modify_prepare
40 * and post_process are called. No locks need to be taken here.
41 *
42 * Stop machine will make sure currently running NMIs are done
43 * and new NMIs will see the updated variable before we need
44 * to worry about NMIs doing memory barriers.
45 */
46static int modifying_code __read_mostly;
47static DEFINE_PER_CPU(int, save_modifying_code);
48
33int ftrace_arch_code_modify_prepare(void) 49int ftrace_arch_code_modify_prepare(void)
34{ 50{
35 set_kernel_text_rw(); 51 set_kernel_text_rw();
52 modifying_code = 1;
36 return 0; 53 return 0;
37} 54}
38 55
39int ftrace_arch_code_modify_post_process(void) 56int ftrace_arch_code_modify_post_process(void)
40{ 57{
58 modifying_code = 0;
41 set_kernel_text_ro(); 59 set_kernel_text_ro();
42 return 0; 60 return 0;
43} 61}
@@ -149,6 +167,11 @@ static void ftrace_mod_code(void)
149 167
150void ftrace_nmi_enter(void) 168void ftrace_nmi_enter(void)
151{ 169{
170 __get_cpu_var(save_modifying_code) = modifying_code;
171
172 if (!__get_cpu_var(save_modifying_code))
173 return;
174
152 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { 175 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
153 smp_rmb(); 176 smp_rmb();
154 ftrace_mod_code(); 177 ftrace_mod_code();
@@ -160,6 +183,9 @@ void ftrace_nmi_enter(void)
160 183
161void ftrace_nmi_exit(void) 184void ftrace_nmi_exit(void)
162{ 185{
186 if (!__get_cpu_var(save_modifying_code))
187 return;
188
163 /* Finish all executions before clearing nmi_running */ 189 /* Finish all executions before clearing nmi_running */
164 smp_mb(); 190 smp_mb();
165 atomic_dec(&nmi_running); 191 atomic_dec(&nmi_running);
@@ -484,13 +510,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
484 } 510 }
485} 511}
486#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 512#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
487
488#ifdef CONFIG_FTRACE_SYSCALLS
489
490extern unsigned long *sys_call_table;
491
492unsigned long __init arch_syscall_addr(int nr)
493{
494 return (unsigned long)(&sys_call_table)[nr];
495}
496#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 5051b94c9069..adedeef1dedc 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,6 +29,16 @@ static void __init i386_default_early_setup(void)
29 29
30void __init i386_start_kernel(void) 30void __init i386_start_kernel(void)
31{ 31{
32#ifdef CONFIG_X86_TRAMPOLINE
33 /*
34 * But first pinch a few for the stack/trampoline stuff
35 * FIXME: Don't need the extra page at 4K, but need to fix
36 * trampoline before removing it. (see the GDT stuff)
37 */
38 reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
39 "EX TRAMPOLINE");
40#endif
41
32 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 42 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
33 43
34#ifdef CONFIG_BLK_DEV_INITRD 44#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7fd318bac59c..37c3d4b17d85 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -442,8 +442,8 @@ is386: movl $2,%ecx # set MP
442 */ 442 */
443 cmpb $0,ready 443 cmpb $0,ready
444 jne 1f 444 jne 1f
445 movl $per_cpu__gdt_page,%eax 445 movl $gdt_page,%eax
446 movl $per_cpu__stack_canary,%ecx 446 movl $stack_canary,%ecx
447 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) 447 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
448 shrl $16, %ecx 448 shrl $16, %ecx
449 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) 449 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -706,7 +706,7 @@ idt_descr:
706 .word 0 # 32 bit align gdt_desc.address 706 .word 0 # 32 bit align gdt_desc.address
707ENTRY(early_gdt_descr) 707ENTRY(early_gdt_descr)
708 .word GDT_ENTRIES*8-1 708 .word GDT_ENTRIES*8-1
709 .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ 709 .long gdt_page /* Overwritten for secondary CPUs */
710 710
711/* 711/*
712 * The boot_gdt must mirror the equivalent in setup.S and is 712 * The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad80a1c718c6..ee4fa1bfcb33 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -266,7 +266,7 @@ static void hpet_resume_device(void)
266 force_hpet_resume(); 266 force_hpet_resume();
267} 267}
268 268
269static void hpet_resume_counter(void) 269static void hpet_resume_counter(struct clocksource *cs)
270{ 270{
271 hpet_resume_device(); 271 hpet_resume_device();
272 hpet_restart_counter(); 272 hpet_restart_counter();
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 05d5fec64a94..dca2802c666f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -212,25 +212,6 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); 212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213} 213}
214 214
215/*
216 * Store a breakpoint's encoded address, length, and type.
217 */
218static int arch_store_info(struct perf_event *bp)
219{
220 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
221 /*
222 * For kernel-addresses, either the address or symbol name can be
223 * specified.
224 */
225 if (info->name)
226 info->address = (unsigned long)
227 kallsyms_lookup_name(info->name);
228 if (info->address)
229 return 0;
230
231 return -EINVAL;
232}
233
234int arch_bp_generic_fields(int x86_len, int x86_type, 215int arch_bp_generic_fields(int x86_len, int x86_type,
235 int *gen_len, int *gen_type) 216 int *gen_len, int *gen_type)
236{ 217{
@@ -362,10 +343,13 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
362 return ret; 343 return ret;
363 } 344 }
364 345
365 ret = arch_store_info(bp); 346 /*
366 347 * For kernel-addresses, either the address or symbol name can be
367 if (ret < 0) 348 * specified.
368 return ret; 349 */
350 if (info->name)
351 info->address = (unsigned long)
352 kallsyms_lookup_name(info->name);
369 /* 353 /*
370 * Check that the low-order bits of the address are appropriate 354 * Check that the low-order bits of the address are appropriate
371 * for the alignment implied by len. 355 * for the alignment implied by len.
@@ -502,8 +486,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
502 rcu_read_lock(); 486 rcu_read_lock();
503 487
504 bp = per_cpu(bp_per_reg[i], cpu); 488 bp = per_cpu(bp_per_reg[i], cpu);
505 if (bp)
506 rc = NOTIFY_DONE;
507 /* 489 /*
508 * Reset the 'i'th TRAP bit in dr6 to denote completion of 490 * Reset the 'i'th TRAP bit in dr6 to denote completion of
509 * exception handling 491 * exception handling
@@ -522,7 +504,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
522 504
523 rcu_read_unlock(); 505 rcu_read_unlock();
524 } 506 }
525 if (dr6 & (~DR_TRAP_BITS)) 507 /*
508 * Further processing in do_debug() is needed for a) user-space
509 * breakpoints (to generate signals) and b) when the system has
 510 * taken an exception due to multiple causes
511 */
512 if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
513 (dr6 & (~DR_TRAP_BITS)))
526 rc = NOTIFY_DONE; 514 rc = NOTIFY_DONE;
527 515
528 set_debugreg(dr7, 7); 516 set_debugreg(dr7, 7);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f2f8540a7f3d..c01a2b846d47 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -164,6 +164,11 @@ int init_fpu(struct task_struct *tsk)
164 return 0; 164 return 0;
165} 165}
166 166
167/*
168 * The xstateregs_active() routine is the same as the fpregs_active() routine,
169 * as the "regset->n" for the xstate regset will be updated based on the feature
170 * capabilites supported by the xsave.
171 */
167int fpregs_active(struct task_struct *target, const struct user_regset *regset) 172int fpregs_active(struct task_struct *target, const struct user_regset *regset)
168{ 173{
169 return tsk_used_math(target) ? regset->n : 0; 174 return tsk_used_math(target) ? regset->n : 0;
@@ -204,8 +209,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
204 if (ret) 209 if (ret)
205 return ret; 210 return ret;
206 211
207 set_stopped_child_used_math(target);
208
209 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 212 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
210 &target->thread.xstate->fxsave, 0, -1); 213 &target->thread.xstate->fxsave, 0, -1);
211 214
@@ -224,6 +227,68 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
224 return ret; 227 return ret;
225} 228}
226 229
230int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
231 unsigned int pos, unsigned int count,
232 void *kbuf, void __user *ubuf)
233{
234 int ret;
235
236 if (!cpu_has_xsave)
237 return -ENODEV;
238
239 ret = init_fpu(target);
240 if (ret)
241 return ret;
242
243 /*
 244 * Copy the 48 bytes defined by the software first into the xstate
245 * memory layout in the thread struct, so that we can copy the entire
246 * xstateregs to the user using one user_regset_copyout().
247 */
248 memcpy(&target->thread.xstate->fxsave.sw_reserved,
249 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
250
251 /*
252 * Copy the xstate memory layout.
253 */
254 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
255 &target->thread.xstate->xsave, 0, -1);
256 return ret;
257}
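
The routine above first stamps the software-defined bytes (a 48-byte sw_reserved tail of the 512-byte FXSAVE image, under the layout assumed here) into the in-kernel copy, so that the whole xstate image can then go out to user space in a single copyout. The pattern, with made-up sizes and names, reduces to:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define FXSAVE_SIZE	512
#define SW_BYTES	48	/* software-defined tail of the legacy region (assumed) */

struct xstate_image {
	uint8_t legacy[FXSAVE_SIZE];	/* last SW_BYTES are sw_reserved */
	uint8_t xsave_hdr[64];
	uint8_t ext_state[256];		/* size is arbitrary for the sketch */
};

static const uint8_t sw_signature[SW_BYTES] = { 'x', 's', 'a', 'v', 'e' };

/* Stamp the software bytes, then hand the whole image out in one bulk copy. */
static void export_xstate(const struct xstate_image *src, struct xstate_image *dst)
{
	struct xstate_image tmp = *src;

	memcpy(tmp.legacy + FXSAVE_SIZE - SW_BYTES, sw_signature, SW_BYTES);
	memcpy(dst, &tmp, sizeof(tmp));	/* single copy, as in the kernel path */
}

int main(void)
{
	struct xstate_image in = { { 0 } }, out;

	export_xstate(&in, &out);
	printf("sw_reserved starts with '%c'\n", out.legacy[FXSAVE_SIZE - SW_BYTES]);
	return 0;
}
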
258
259int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
260 unsigned int pos, unsigned int count,
261 const void *kbuf, const void __user *ubuf)
262{
263 int ret;
264 struct xsave_hdr_struct *xsave_hdr;
265
266 if (!cpu_has_xsave)
267 return -ENODEV;
268
269 ret = init_fpu(target);
270 if (ret)
271 return ret;
272
273 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
274 &target->thread.xstate->xsave, 0, -1);
275
276 /*
277 * mxcsr reserved bits must be masked to zero for security reasons.
278 */
279 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
280
281 xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
282
283 xsave_hdr->xstate_bv &= pcntxt_mask;
284 /*
285 * These bits must be zero.
286 */
287 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
288
289 return ret;
290}
291
227#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 292#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
228 293
229/* 294/*
@@ -404,8 +469,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
404 if (ret) 469 if (ret)
405 return ret; 470 return ret;
406 471
407 set_stopped_child_used_math(target);
408
409 if (!HAVE_HWFP) 472 if (!HAVE_HWFP)
410 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 473 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
411 474
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 5b8c7505b3bc..b43bbaebe2c0 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h> 51#include <linux/kallsyms.h>
52#include <linux/ftrace.h>
52 53
53#include <asm/cacheflush.h> 54#include <asm/cacheflush.h>
54#include <asm/desc.h> 55#include <asm/desc.h>
@@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
106}; 107};
107const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 108const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
108 109
109/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 110static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
110static void __kprobes set_jmp_op(void *from, void *to)
111{ 111{
112 struct __arch_jmp_op { 112 struct __arch_relative_insn {
113 char op; 113 u8 op;
114 s32 raddr; 114 s32 raddr;
115 } __attribute__((packed)) * jop; 115 } __attribute__((packed)) *insn;
116 jop = (struct __arch_jmp_op *)from; 116
117 jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); 117 insn = (struct __arch_relative_insn *)from;
118 jop->op = RELATIVEJUMP_INSTRUCTION; 118 insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
119 insn->op = op;
120}
121
122/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
123static void __kprobes synthesize_reljump(void *from, void *to)
124{
125 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
119} 126}
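
The displacement written by the new helper above is computed relative to the end of the 5-byte instruction: raddr = to - (from + 5), with 0xe9 as the jmp rel32 opcode. A stand-alone check of that arithmetic (the sketch fills a local struct rather than writing at 'from', so it stays runnable without an executable mapping):

#include <stdint.h>
#include <stdio.h>

#define RELATIVEJUMP_OPCODE 0xe9	/* jmp rel32 */

struct relative_insn {
	uint8_t op;
	int32_t raddr;
} __attribute__((packed));

/* Encode "jmp to" as if placed at address 'from'. */
static void encode_reljump(void *from, void *to, struct relative_insn *insn)
{
	insn->raddr = (int32_t)((long)to - ((long)from + 5));
	insn->op = RELATIVEJUMP_OPCODE;
}

int main(void)
{
	struct relative_insn insn;

	/* A jump placed at 0x1000 that should land on 0x2000. */
	encode_reljump((void *)0x1000, (void *)0x2000, &insn);
	printf("opcode=%#x disp=%d (0x2000 - 0x1005)\n", insn.op, insn.raddr);
	return 0;
}
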
120 127
121/* 128/*
@@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
202 /* 209 /*
203 * Basically, kp->ainsn.insn has an original instruction. 210 * Basically, kp->ainsn.insn has an original instruction.
204 * However, RIP-relative instruction can not do single-stepping 211 * However, RIP-relative instruction can not do single-stepping
205 * at different place, fix_riprel() tweaks the displacement of 212 * at different place, __copy_instruction() tweaks the displacement of
206 * that instruction. In that case, we can't recover the instruction 213 * that instruction. In that case, we can't recover the instruction
207 * from the kp->ainsn.insn. 214 * from the kp->ainsn.insn.
208 * 215 *
@@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
284} 291}
285 292
286/* 293/*
287 * Adjust the displacement if the instruction uses the %rip-relative 294 * Copy an instruction and adjust the displacement if the instruction
288 * addressing mode. 295 * uses the %rip-relative addressing mode.
289 * If it does, Return the address of the 32-bit displacement word. 296 * If it does, Return the address of the 32-bit displacement word.
290 * If not, return null. 297 * If not, return null.
291 * Only applicable to 64-bit x86. 298 * Only applicable to 64-bit x86.
292 */ 299 */
293static void __kprobes fix_riprel(struct kprobe *p) 300static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
294{ 301{
295#ifdef CONFIG_X86_64
296 struct insn insn; 302 struct insn insn;
297 kernel_insn_init(&insn, p->ainsn.insn); 303 int ret;
304 kprobe_opcode_t buf[MAX_INSN_SIZE];
298 305
306 kernel_insn_init(&insn, src);
307 if (recover) {
308 insn_get_opcode(&insn);
309 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
310 ret = recover_probed_instruction(buf,
311 (unsigned long)src);
312 if (ret)
313 return 0;
314 kernel_insn_init(&insn, buf);
315 }
316 }
317 insn_get_length(&insn);
318 memcpy(dest, insn.kaddr, insn.length);
319
320#ifdef CONFIG_X86_64
299 if (insn_rip_relative(&insn)) { 321 if (insn_rip_relative(&insn)) {
300 s64 newdisp; 322 s64 newdisp;
301 u8 *disp; 323 u8 *disp;
324 kernel_insn_init(&insn, dest);
302 insn_get_displacement(&insn); 325 insn_get_displacement(&insn);
303 /* 326 /*
304 * The copied instruction uses the %rip-relative addressing 327 * The copied instruction uses the %rip-relative addressing
@@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p)
312 * extension of the original signed 32-bit displacement would 335 * extension of the original signed 32-bit displacement would
313 * have given. 336 * have given.
314 */ 337 */
315 newdisp = (u8 *) p->addr + (s64) insn.displacement.value - 338 newdisp = (u8 *) src + (s64) insn.displacement.value -
316 (u8 *) p->ainsn.insn; 339 (u8 *) dest;
317 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ 340 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
318 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); 341 disp = (u8 *) dest + insn_offset_displacement(&insn);
319 *(s32 *) disp = (s32) newdisp; 342 *(s32 *) disp = (s32) newdisp;
320 } 343 }
321#endif 344#endif
345 return insn.length;
322} 346}
323 347
324static void __kprobes arch_copy_kprobe(struct kprobe *p) 348static void __kprobes arch_copy_kprobe(struct kprobe *p)
325{ 349{
326 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 350 /*
327 351 * Copy an instruction without recovering int3, because it will be
328 fix_riprel(p); 352 * put by another subsystem.
353 */
354 __copy_instruction(p->ainsn.insn, p->addr, 0);
329 355
330 if (can_boost(p->addr)) 356 if (can_boost(p->addr))
331 p->ainsn.boostable = 0; 357 p->ainsn.boostable = 0;
@@ -337,6 +363,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
337 363
338int __kprobes arch_prepare_kprobe(struct kprobe *p) 364int __kprobes arch_prepare_kprobe(struct kprobe *p)
339{ 365{
366 if (alternatives_text_reserved(p->addr, p->addr))
367 return -EINVAL;
368
340 if (!can_probe((unsigned long)p->addr)) 369 if (!can_probe((unsigned long)p->addr))
341 return -EILSEQ; 370 return -EILSEQ;
342 /* insn: must be on special executable page on x86. */ 371 /* insn: must be on special executable page on x86. */
@@ -403,18 +432,6 @@ static void __kprobes restore_btf(void)
403 update_debugctlmsr(current->thread.debugctlmsr); 432 update_debugctlmsr(current->thread.debugctlmsr);
404} 433}
405 434
406static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
407{
408 clear_btf();
409 regs->flags |= X86_EFLAGS_TF;
410 regs->flags &= ~X86_EFLAGS_IF;
411 /* single step inline if the instruction is an int3 */
412 if (p->opcode == BREAKPOINT_INSTRUCTION)
413 regs->ip = (unsigned long)p->addr;
414 else
415 regs->ip = (unsigned long)p->ainsn.insn;
416}
417
418void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
419 struct pt_regs *regs) 436 struct pt_regs *regs)
420{ 437{
@@ -426,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
426 *sara = (unsigned long) &kretprobe_trampoline; 443 *sara = (unsigned long) &kretprobe_trampoline;
427} 444}
428 445
446#ifdef CONFIG_OPTPROBES
447static int __kprobes setup_detour_execution(struct kprobe *p,
448 struct pt_regs *regs,
449 int reenter);
450#else
451#define setup_detour_execution(p, regs, reenter) (0)
452#endif
453
429static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, 454static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
430 struct kprobe_ctlblk *kcb) 455 struct kprobe_ctlblk *kcb, int reenter)
431{ 456{
432#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) 457 if (setup_detour_execution(p, regs, reenter))
458 return;
459
460#if !defined(CONFIG_PREEMPT)
433 if (p->ainsn.boostable == 1 && !p->post_handler) { 461 if (p->ainsn.boostable == 1 && !p->post_handler) {
434 /* Boost up -- we can execute copied instructions directly */ 462 /* Boost up -- we can execute copied instructions directly */
435 reset_current_kprobe(); 463 if (!reenter)
464 reset_current_kprobe();
465 /*
466 * Reentering boosted probe doesn't reset current_kprobe,
467 * nor set current_kprobe, because it doesn't use single
468 * stepping.
469 */
436 regs->ip = (unsigned long)p->ainsn.insn; 470 regs->ip = (unsigned long)p->ainsn.insn;
437 preempt_enable_no_resched(); 471 preempt_enable_no_resched();
438 return; 472 return;
439 } 473 }
440#endif 474#endif
441 prepare_singlestep(p, regs); 475 if (reenter) {
442 kcb->kprobe_status = KPROBE_HIT_SS; 476 save_previous_kprobe(kcb);
477 set_current_kprobe(p, regs, kcb);
478 kcb->kprobe_status = KPROBE_REENTER;
479 } else
480 kcb->kprobe_status = KPROBE_HIT_SS;
481 /* Prepare real single stepping */
482 clear_btf();
483 regs->flags |= X86_EFLAGS_TF;
484 regs->flags &= ~X86_EFLAGS_IF;
485 /* single step inline if the instruction is an int3 */
486 if (p->opcode == BREAKPOINT_INSTRUCTION)
487 regs->ip = (unsigned long)p->addr;
488 else
489 regs->ip = (unsigned long)p->ainsn.insn;
443} 490}
444 491
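The reworked setup_singlestep() above folds the old prepare_singlestep() body in: it arms the step by setting the trap flag and clearing the interrupt flag in the saved EFLAGS before redirecting regs->ip to the copied instruction. A rough user-space sketch of just that flag manipulation (the X86_EFLAGS_* values are the architectural bit positions, not quoted from this patch):

#include <stdio.h>

#define X86_EFLAGS_TF 0x00000100UL  /* trap flag: single-step trap after one instruction */
#define X86_EFLAGS_IF 0x00000200UL  /* interrupt enable flag */

static unsigned long arm_single_step(unsigned long flags)
{
        flags |= X86_EFLAGS_TF;     /* raise #DB after the stepped instruction */
        flags &= ~X86_EFLAGS_IF;    /* keep interrupts from firing mid-step */
        return flags;
}

int main(void)
{
        unsigned long flags = 0x246UL;  /* a plausible saved EFLAGS value */

        printf("before %#lx, after %#lx\n", flags, arm_single_step(flags));
        return 0;
}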
445/* 492/*
@@ -453,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
453 switch (kcb->kprobe_status) { 500 switch (kcb->kprobe_status) {
454 case KPROBE_HIT_SSDONE: 501 case KPROBE_HIT_SSDONE:
455 case KPROBE_HIT_ACTIVE: 502 case KPROBE_HIT_ACTIVE:
456 save_previous_kprobe(kcb);
457 set_current_kprobe(p, regs, kcb);
458 kprobes_inc_nmissed_count(p); 503 kprobes_inc_nmissed_count(p);
459 prepare_singlestep(p, regs); 504 setup_singlestep(p, regs, kcb, 1);
460 kcb->kprobe_status = KPROBE_REENTER;
461 break; 505 break;
462 case KPROBE_HIT_SS: 506 case KPROBE_HIT_SS:
463 /* A probe has been hit in the codepath leading up to, or just 507 /* A probe has been hit in the codepath leading up to, or just
@@ -532,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
532 * more here. 576 * more here.
533 */ 577 */
534 if (!p->pre_handler || !p->pre_handler(p, regs)) 578 if (!p->pre_handler || !p->pre_handler(p, regs))
535 setup_singlestep(p, regs, kcb); 579 setup_singlestep(p, regs, kcb, 0);
536 return 1; 580 return 1;
537 } 581 }
538 } else if (kprobe_running()) { 582 } else if (kprobe_running()) {
539 p = __get_cpu_var(current_kprobe); 583 p = __get_cpu_var(current_kprobe);
540 if (p->break_handler && p->break_handler(p, regs)) { 584 if (p->break_handler && p->break_handler(p, regs)) {
541 setup_singlestep(p, regs, kcb); 585 setup_singlestep(p, regs, kcb, 0);
542 return 1; 586 return 1;
543 } 587 }
544 } /* else: not a kprobe fault; let the kernel handle it */ 588 } /* else: not a kprobe fault; let the kernel handle it */
@@ -547,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
547 return 0; 591 return 0;
548} 592}
549 593
594#ifdef CONFIG_X86_64
595#define SAVE_REGS_STRING \
596 /* Skip cs, ip, orig_ax. */ \
597 " subq $24, %rsp\n" \
598 " pushq %rdi\n" \
599 " pushq %rsi\n" \
600 " pushq %rdx\n" \
601 " pushq %rcx\n" \
602 " pushq %rax\n" \
603 " pushq %r8\n" \
604 " pushq %r9\n" \
605 " pushq %r10\n" \
606 " pushq %r11\n" \
607 " pushq %rbx\n" \
608 " pushq %rbp\n" \
609 " pushq %r12\n" \
610 " pushq %r13\n" \
611 " pushq %r14\n" \
612 " pushq %r15\n"
613#define RESTORE_REGS_STRING \
614 " popq %r15\n" \
615 " popq %r14\n" \
616 " popq %r13\n" \
617 " popq %r12\n" \
618 " popq %rbp\n" \
619 " popq %rbx\n" \
620 " popq %r11\n" \
621 " popq %r10\n" \
622 " popq %r9\n" \
623 " popq %r8\n" \
624 " popq %rax\n" \
625 " popq %rcx\n" \
626 " popq %rdx\n" \
627 " popq %rsi\n" \
628 " popq %rdi\n" \
629 /* Skip orig_ax, ip, cs */ \
630 " addq $24, %rsp\n"
631#else
632#define SAVE_REGS_STRING \
633 /* Skip cs, ip, orig_ax and gs. */ \
634 " subl $16, %esp\n" \
635 " pushl %fs\n" \
636 " pushl %ds\n" \
637 " pushl %es\n" \
638 " pushl %eax\n" \
639 " pushl %ebp\n" \
640 " pushl %edi\n" \
641 " pushl %esi\n" \
642 " pushl %edx\n" \
643 " pushl %ecx\n" \
644 " pushl %ebx\n"
645#define RESTORE_REGS_STRING \
646 " popl %ebx\n" \
647 " popl %ecx\n" \
648 " popl %edx\n" \
649 " popl %esi\n" \
650 " popl %edi\n" \
651 " popl %ebp\n" \
652 " popl %eax\n" \
653	/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here */\
654 " addl $24, %esp\n"
655#endif
656
550/* 657/*
551 * When a retprobed function returns, this code saves registers and 658 * When a retprobed function returns, this code saves registers and
552 * calls trampoline_handler(), which calls the kretprobe's handler. 659
@@ -560,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
560 /* We don't bother saving the ss register */ 667 /* We don't bother saving the ss register */
561 " pushq %rsp\n" 668 " pushq %rsp\n"
562 " pushfq\n" 669 " pushfq\n"
563 /* 670 SAVE_REGS_STRING
564 * Skip cs, ip, orig_ax.
565 * trampoline_handler() will plug in these values
566 */
567 " subq $24, %rsp\n"
568 " pushq %rdi\n"
569 " pushq %rsi\n"
570 " pushq %rdx\n"
571 " pushq %rcx\n"
572 " pushq %rax\n"
573 " pushq %r8\n"
574 " pushq %r9\n"
575 " pushq %r10\n"
576 " pushq %r11\n"
577 " pushq %rbx\n"
578 " pushq %rbp\n"
579 " pushq %r12\n"
580 " pushq %r13\n"
581 " pushq %r14\n"
582 " pushq %r15\n"
583 " movq %rsp, %rdi\n" 671 " movq %rsp, %rdi\n"
584 " call trampoline_handler\n" 672 " call trampoline_handler\n"
585 /* Replace saved sp with true return address. */ 673 /* Replace saved sp with true return address. */
586 " movq %rax, 152(%rsp)\n" 674 " movq %rax, 152(%rsp)\n"
587 " popq %r15\n" 675 RESTORE_REGS_STRING
588 " popq %r14\n"
589 " popq %r13\n"
590 " popq %r12\n"
591 " popq %rbp\n"
592 " popq %rbx\n"
593 " popq %r11\n"
594 " popq %r10\n"
595 " popq %r9\n"
596 " popq %r8\n"
597 " popq %rax\n"
598 " popq %rcx\n"
599 " popq %rdx\n"
600 " popq %rsi\n"
601 " popq %rdi\n"
602 /* Skip orig_ax, ip, cs */
603 " addq $24, %rsp\n"
604 " popfq\n" 676 " popfq\n"
605#else 677#else
606 " pushf\n" 678 " pushf\n"
607 /* 679 SAVE_REGS_STRING
608 * Skip cs, ip, orig_ax and gs.
609 * trampoline_handler() will plug in these values
610 */
611 " subl $16, %esp\n"
612 " pushl %fs\n"
613 " pushl %es\n"
614 " pushl %ds\n"
615 " pushl %eax\n"
616 " pushl %ebp\n"
617 " pushl %edi\n"
618 " pushl %esi\n"
619 " pushl %edx\n"
620 " pushl %ecx\n"
621 " pushl %ebx\n"
622 " movl %esp, %eax\n" 680 " movl %esp, %eax\n"
623 " call trampoline_handler\n" 681 " call trampoline_handler\n"
624 /* Move flags to cs */ 682 /* Move flags to cs */
@@ -626,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
626 " movl %edx, 52(%esp)\n" 684 " movl %edx, 52(%esp)\n"
627 /* Replace saved flags with true return address. */ 685 /* Replace saved flags with true return address. */
628 " movl %eax, 56(%esp)\n" 686 " movl %eax, 56(%esp)\n"
629 " popl %ebx\n" 687 RESTORE_REGS_STRING
630 " popl %ecx\n"
631 " popl %edx\n"
632 " popl %esi\n"
633 " popl %edi\n"
634 " popl %ebp\n"
635 " popl %eax\n"
636 /* Skip ds, es, fs, gs, orig_ax and ip */
637 " addl $24, %esp\n"
638 " popf\n" 688 " popf\n"
639#endif 689#endif
640 " ret\n"); 690 " ret\n");
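The "movq %rax, 152(%rsp)" patch-up above only works because SAVE_REGS_STRING reproduces the pt_regs layout: fifteen 8-byte pushes, a 24-byte hole for orig_ax/ip/cs, then the flags and the %rsp that were pushed first. A quick sanity check of that offset (a sketch; the arithmetic is inferred from the pushes above, not copied from the kernel):

#include <stdio.h>

int main(void)
{
        /* Offsets measured upward from the %rsp passed to trampoline_handler(). */
        unsigned long gprs  = 15 * 8;   /* r15..rdi pushed by SAVE_REGS_STRING */
        unsigned long hole  = 24;       /* "subq $24": room for orig_ax/ip/cs  */
        unsigned long flags = 8;        /* pushfq done before SAVE_REGS_STRING */

        printf("saved %%rsp slot sits at offset %lu -> movq %%rax, %lu(%%rsp)\n",
               gprs + hole + flags, gprs + hole + flags);
        return 0;
}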
@@ -802,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p,
802 * These instructions can be executed directly if it 852 * These instructions can be executed directly if it
803 * jumps back to correct address. 853 * jumps back to correct address.
804 */ 854 */
805 set_jmp_op((void *)regs->ip, 855 synthesize_reljump((void *)regs->ip,
806 (void *)orig_ip + (regs->ip - copy_ip)); 856 (void *)orig_ip + (regs->ip - copy_ip));
807 p->ainsn.boostable = 1; 857 p->ainsn.boostable = 1;
808 } else { 858 } else {
809 p->ainsn.boostable = -1; 859 p->ainsn.boostable = -1;
@@ -1030,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1030 return 0; 1080 return 0;
1031} 1081}
1032 1082
1083
1084#ifdef CONFIG_OPTPROBES
1085
1086/* Insert a call instruction at address 'from', which calls address 'to'.*/
1087static void __kprobes synthesize_relcall(void *from, void *to)
1088{
1089 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1090}
1091
1092/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1093static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1094 unsigned long val)
1095{
1096#ifdef CONFIG_X86_64
1097 *addr++ = 0x48;
1098 *addr++ = 0xbf;
1099#else
1100 *addr++ = 0xb8;
1101#endif
1102 *(unsigned long *)addr = val;
1103}
1104
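synthesize_set_arg1() plants the struct optimized_kprobe pointer as the first C argument of the detour: 48 bf imm64 encodes movabs $imm,%rdi on x86-64, while b8 imm32 is mov $imm,%eax on 32-bit. A self-contained sketch of the same encoder, writing into an ordinary buffer instead of an instruction slot (the function and buffer names here are illustrative, not the kernel's):

#include <stdio.h>
#include <string.h>

/* Emit "mov $val, <first-argument register>" into buf, return encoded length. */
static int emit_set_arg1(unsigned char *buf, unsigned long val)
{
        int len = 0;

#ifdef __x86_64__
        buf[len++] = 0x48;      /* REX.W */
        buf[len++] = 0xbf;      /* movabs $imm64, %rdi */
#else
        buf[len++] = 0xb8;      /* mov $imm32, %eax */
#endif
        memcpy(buf + len, &val, sizeof(val));   /* little-endian immediate */
        return len + (int)sizeof(val);
}

int main(void)
{
        unsigned char buf[16];
        int i, n = emit_set_arg1(buf, 0x12345678UL);

        for (i = 0; i < n; i++)
                printf("%02x ", buf[i]);
        printf("\n");
        return 0;
}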
1105void __kprobes kprobes_optinsn_template_holder(void)
1106{
1107 asm volatile (
1108 ".global optprobe_template_entry\n"
1109 "optprobe_template_entry: \n"
1110#ifdef CONFIG_X86_64
1111 /* We don't bother saving the ss register */
1112 " pushq %rsp\n"
1113 " pushfq\n"
1114 SAVE_REGS_STRING
1115 " movq %rsp, %rsi\n"
1116 ".global optprobe_template_val\n"
1117 "optprobe_template_val: \n"
1118 ASM_NOP5
1119 ASM_NOP5
1120 ".global optprobe_template_call\n"
1121 "optprobe_template_call: \n"
1122 ASM_NOP5
1123 /* Move flags to rsp */
1124 " movq 144(%rsp), %rdx\n"
1125 " movq %rdx, 152(%rsp)\n"
1126 RESTORE_REGS_STRING
1127 /* Skip flags entry */
1128 " addq $8, %rsp\n"
1129 " popfq\n"
1130#else /* CONFIG_X86_32 */
1131 " pushf\n"
1132 SAVE_REGS_STRING
1133 " movl %esp, %edx\n"
1134 ".global optprobe_template_val\n"
1135 "optprobe_template_val: \n"
1136 ASM_NOP5
1137 ".global optprobe_template_call\n"
1138 "optprobe_template_call: \n"
1139 ASM_NOP5
1140 RESTORE_REGS_STRING
1141 " addl $4, %esp\n" /* skip cs */
1142 " popf\n"
1143#endif
1144 ".global optprobe_template_end\n"
1145 "optprobe_template_end: \n");
1146}
1147
1148#define TMPL_MOVE_IDX \
1149 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1150#define TMPL_CALL_IDX \
1151 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1152#define TMPL_END_IDX \
1153 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1154
1155#define INT3_SIZE sizeof(kprobe_opcode_t)
1156
1157/* Optimized kprobe callback function: called from optinsn */
1158static void __kprobes optimized_callback(struct optimized_kprobe *op,
1159 struct pt_regs *regs)
1160{
1161 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1162
1163 preempt_disable();
1164 if (kprobe_running()) {
1165 kprobes_inc_nmissed_count(&op->kp);
1166 } else {
1167 /* Save skipped registers */
1168#ifdef CONFIG_X86_64
1169 regs->cs = __KERNEL_CS;
1170#else
1171 regs->cs = __KERNEL_CS | get_kernel_rpl();
1172 regs->gs = 0;
1173#endif
1174 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1175 regs->orig_ax = ~0UL;
1176
1177 __get_cpu_var(current_kprobe) = &op->kp;
1178 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1179 opt_pre_handler(&op->kp, regs);
1180 __get_cpu_var(current_kprobe) = NULL;
1181 }
1182 preempt_enable_no_resched();
1183}
1184
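optimized_callback() has to hand the pre-handler a pt_regs that looks just like the one an int3 trap would have produced, which is why it fakes cs, gs and orig_ax and sets ip to the probe address plus INT3_SIZE: the breakpoint trap reports the address after the 0xcc byte. A tiny sketch of that ip adjustment (the probe address is made up):

#include <stdio.h>

#define INT3_SIZE 1  /* int3 is the single byte 0xcc */

int main(void)
{
        unsigned long probe_addr = 0xc1234560UL;  /* made-up probed address */

        /* An int3 trap reports the address *after* the breakpoint byte, so the
         * detour fakes the same value before calling the pre-handler.        */
        printf("handler sees regs->ip = %#lx\n", probe_addr + INT3_SIZE);
        return 0;
}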
1185static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1186{
1187 int len = 0, ret;
1188
1189 while (len < RELATIVEJUMP_SIZE) {
1190 ret = __copy_instruction(dest + len, src + len, 1);
1191 if (!ret || !can_boost(dest + len))
1192 return -EINVAL;
1193 len += ret;
1194 }
1195 /* Check whether the address range is reserved */
1196 if (ftrace_text_reserved(src, src + len - 1) ||
1197 alternatives_text_reserved(src, src + len - 1))
1198 return -EBUSY;
1199
1200 return len;
1201}
1202
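copy_optimized_instructions() keeps relocating whole instructions until at least RELATIVEJUMP_SIZE bytes are covered, so the 5-byte jump never lands in the middle of an instruction; that is also why optinsn.size can end up larger than 5. A toy sketch of the length accumulation (the instruction lengths are invented, standing in for the real decoder):

#include <stdio.h>

#define RELATIVEJUMP_SIZE 5  /* bytes clobbered by the jmp rel32 */

int main(void)
{
        /* Pretend decoder output: instruction lengths at the probe address. */
        int insn_len[] = { 3, 1, 4, 2 };
        int len = 0, i = 0;

        /* Copy whole instructions until the 5-byte window is fully covered,
         * so the jump never cuts an instruction in half.                    */
        while (len < RELATIVEJUMP_SIZE)
                len += insn_len[i++];

        printf("relocate %d instructions = %d bytes (optinsn.size)\n", i, len);
        return 0;
}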
1203/* Check whether insn is indirect jump */
1204static int __kprobes insn_is_indirect_jump(struct insn *insn)
1205{
1206 return ((insn->opcode.bytes[0] == 0xff &&
1207 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1208 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1209}
1210
1211/* Check whether insn jumps into specified address range */
1212static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1213{
1214 unsigned long target = 0;
1215
1216 switch (insn->opcode.bytes[0]) {
1217 case 0xe0: /* loopne */
1218 case 0xe1: /* loope */
1219 case 0xe2: /* loop */
1220 case 0xe3: /* jcxz */
1221 case 0xe9: /* near relative jump */
1222 case 0xeb: /* short relative jump */
1223 break;
1224 case 0x0f:
1225 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1226 break;
1227 return 0;
1228 default:
1229 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1230 break;
1231 return 0;
1232 }
1233 target = (unsigned long)insn->next_byte + insn->immediate.value;
1234
1235 return (start <= target && target <= start + len);
1236}
1237
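For the relative branches listed above, the target is simply the address of the next instruction plus the sign-extended displacement, and can_optimize() later rejects the probe if any such target lands on the displacement bytes of the 5-byte jump. A small sketch of that computation with made-up addresses (no struct insn involved):

#include <stdio.h>

/* Target of a rel8/rel32 branch: next-instruction address + displacement. */
static unsigned long branch_target(unsigned long next_byte, long disp)
{
        return next_byte + disp;
}

static int jumps_into_range(unsigned long target, unsigned long start, int len)
{
        return start <= target && target <= start + len;
}

int main(void)
{
        /* A 2-byte "jmp -16" decoded at 0x1000: the next byte is 0x1002. */
        unsigned long tgt = branch_target(0x1002UL, -16);

        printf("target %#lx inside [%#lx, %#lx]? %d\n",
               tgt, 0xff0UL, 0xff0UL + 5, jumps_into_range(tgt, 0xff0UL, 5));
        return 0;
}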
1238/* Decode the whole function to ensure no instruction jumps into the target */
1239static int __kprobes can_optimize(unsigned long paddr)
1240{
1241 int ret;
1242 unsigned long addr, size = 0, offset = 0;
1243 struct insn insn;
1244 kprobe_opcode_t buf[MAX_INSN_SIZE];
1245 /* Dummy buffer for kallsyms_lookup() */
1246 static char __dummy_buf[KSYM_NAME_LEN];
1247
1248 /* Lookup symbol including addr */
1249 if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
1250 return 0;
1251
1252 /* Check there is enough space for a relative jump. */
1253 if (size - offset < RELATIVEJUMP_SIZE)
1254 return 0;
1255
1256 /* Decode instructions */
1257 addr = paddr - offset;
1258 while (addr < paddr - offset + size) { /* Decode until function end */
1259 if (search_exception_tables(addr))
1260 /*
1261 * Since some fixup code will jump into this function,
1262 * we can't optimize kprobes in this function.
1263 */
1264 return 0;
1265 kernel_insn_init(&insn, (void *)addr);
1266 insn_get_opcode(&insn);
1267 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1268 ret = recover_probed_instruction(buf, addr);
1269 if (ret)
1270 return 0;
1271 kernel_insn_init(&insn, buf);
1272 }
1273 insn_get_length(&insn);
1274 /* Recover address */
1275 insn.kaddr = (void *)addr;
1276 insn.next_byte = (void *)(addr + insn.length);
1277 /* Check any instructions don't jump into target */
1278 if (insn_is_indirect_jump(&insn) ||
1279 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1280 RELATIVE_ADDR_SIZE))
1281 return 0;
1282 addr += insn.length;
1283 }
1284
1285 return 1;
1286}
1287
1288/* Check optimized_kprobe can actually be optimized. */
1289int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1290{
1291 int i;
1292 struct kprobe *p;
1293
1294 for (i = 1; i < op->optinsn.size; i++) {
1295 p = get_kprobe(op->kp.addr + i);
1296 if (p && !kprobe_disabled(p))
1297 return -EEXIST;
1298 }
1299
1300 return 0;
1301}
1302
1303/* Check the addr is within the optimized instructions. */
1304int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1305 unsigned long addr)
1306{
1307 return ((unsigned long)op->kp.addr <= addr &&
1308 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1309}
1310
1311/* Free optimized instruction slot */
1312static __kprobes
1313void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1314{
1315 if (op->optinsn.insn) {
1316 free_optinsn_slot(op->optinsn.insn, dirty);
1317 op->optinsn.insn = NULL;
1318 op->optinsn.size = 0;
1319 }
1320}
1321
1322void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1323{
1324 __arch_remove_optimized_kprobe(op, 1);
1325}
1326
1327/*
1328 * Copy replacing target instructions
1329 * Target instructions MUST be relocatable (checked inside)
1330 */
1331int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1332{
1333 u8 *buf;
1334 int ret;
1335 long rel;
1336
1337 if (!can_optimize((unsigned long)op->kp.addr))
1338 return -EILSEQ;
1339
1340 op->optinsn.insn = get_optinsn_slot();
1341 if (!op->optinsn.insn)
1342 return -ENOMEM;
1343
1344 /*
1345 * Verify if the address gap is in 2GB range, because this uses
1346 * a relative jump.
1347 */
1348 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1349 if (abs(rel) > 0x7fffffff)
1350 return -ERANGE;
1351
1352 buf = (u8 *)op->optinsn.insn;
1353
1354 /* Copy instructions into the out-of-line buffer */
1355 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1356 if (ret < 0) {
1357 __arch_remove_optimized_kprobe(op, 0);
1358 return ret;
1359 }
1360 op->optinsn.size = ret;
1361
1362 /* Copy arch-dep-instance from template */
1363 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1364
1365 /* Set probe information */
1366 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1367
1368 /* Set probe function call */
1369 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1370
1371 /* Set returning jmp instruction at the tail of out-of-line buffer */
1372 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1373 (u8 *)op->kp.addr + op->optinsn.size);
1374
1375 flush_icache_range((unsigned long) buf,
1376 (unsigned long) buf + TMPL_END_IDX +
1377 op->optinsn.size + RELATIVEJUMP_SIZE);
1378 return 0;
1379}
1380
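The resulting out-of-line buffer is laid out as: register save/restore template (with the op pointer patched in at TMPL_MOVE_IDX and the call to optimized_callback at TMPL_CALL_IDX), then the relocated original instructions, then a jump back to the probed address plus the relocated size. A sketch that only prints this layout arithmetic (the index values are placeholders, not the real template offsets):

#include <stdio.h>

#define RELATIVEJUMP_SIZE 5   /* e9 + rel32 */
#define TMPL_MOVE_IDX    10   /* placeholder for the template's "val" offset */
#define TMPL_CALL_IDX    40   /* placeholder for the template's call offset  */
#define TMPL_END_IDX     96   /* placeholder for the template's total length */

int main(void)
{
        int copied = 8;       /* bytes of original instructions relocated    */

        printf("buf[0..%d]   save/restore template\n", TMPL_END_IDX - 1);
        printf("  buf[%d]    <- pointer to the optimized_kprobe (1st arg)\n",
               TMPL_MOVE_IDX);
        printf("  buf[%d]    <- call to optimized_callback\n", TMPL_CALL_IDX);
        printf("buf[%d..%d]  relocated original instructions\n",
               TMPL_END_IDX, TMPL_END_IDX + copied - 1);
        printf("buf[%d..%d]  jmp back to probed address + %d\n",
               TMPL_END_IDX + copied,
               TMPL_END_IDX + copied + RELATIVEJUMP_SIZE - 1, copied);
        return 0;
}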
1381/* Replace a breakpoint (int3) with a relative jump. */
1382int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1383{
1384 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1385 s32 rel = (s32)((long)op->optinsn.insn -
1386 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1387
1388 /* Backup instructions which will be replaced by jump address */
1389 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1390 RELATIVE_ADDR_SIZE);
1391
1392 jmp_code[0] = RELATIVEJUMP_OPCODE;
1393 *(s32 *)(&jmp_code[1]) = rel;
1394
1395 /*
1396 * text_poke_smp doesn't support NMI/MCE code modifying.
1397 * However, since kprobes itself also doesn't support NMI/MCE
1398 * code probing, it's not a problem.
1399 */
1400 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
1401 return 0;
1402}
1403
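The jump written by text_poke_smp() is e9 followed by a 32-bit displacement measured from the end of the 5-byte instruction, which is the same "destination minus (source + RELATIVEJUMP_SIZE)" formula behind the ±2GB check in arch_prepare_optimized_kprobe(). A user-space sketch of assembling such a jump (the addresses are examples):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define RELATIVEJUMP_OPCODE 0xe9
#define RELATIVEJUMP_SIZE   5

/* Build "jmp rel32" at src targeting dest; fail if out of rel32 range. */
static int synthesize_jmp(uint8_t buf[RELATIVEJUMP_SIZE],
                          uint64_t src, uint64_t dest)
{
        int64_t rel = (int64_t)(dest - src) - RELATIVEJUMP_SIZE;
        int32_t rel32;

        if (rel > INT32_MAX || rel < INT32_MIN)
                return -1;                      /* would need a far detour */
        rel32 = (int32_t)rel;
        buf[0] = RELATIVEJUMP_OPCODE;
        memcpy(buf + 1, &rel32, sizeof(rel32)); /* little-endian displacement */
        return 0;
}

int main(void)
{
        uint8_t jmp[RELATIVEJUMP_SIZE];
        int i;

        if (synthesize_jmp(jmp, 0xffffffff81000000ULL, 0xffffffff81200000ULL))
                return 1;
        for (i = 0; i < RELATIVEJUMP_SIZE; i++)
                printf("%02x ", jmp[i]);        /* prints: e9 fb ff 1f 00 */
        printf("\n");
        return 0;
}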
1404/* Replace a relative jump with a breakpoint (int3). */
1405void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1406{
1407 u8 buf[RELATIVEJUMP_SIZE];
1408
1409 /* Set int3 to first byte for kprobes */
1410 buf[0] = BREAKPOINT_INSTRUCTION;
1411 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1412 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1413}
1414
1415static int __kprobes setup_detour_execution(struct kprobe *p,
1416 struct pt_regs *regs,
1417 int reenter)
1418{
1419 struct optimized_kprobe *op;
1420
1421 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1422 /* This kprobe is really able to run optimized path. */
1423 op = container_of(p, struct optimized_kprobe, kp);
1424 /* Detour through copied instructions */
1425 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1426 if (!reenter)
1427 reset_current_kprobe();
1428 preempt_enable_no_resched();
1429 return 1;
1430 }
1431 return 0;
1432}
1433#endif
1434
1033int __init arch_init_kprobes(void) 1435int __init arch_init_kprobes(void)
1034{ 1436{
1035 return 0; 1437 return 0;
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index ebd193e476ca..85a343e28937 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -328,7 +328,7 @@ static int apply_microcode(int cpu)
328 cpu_num, mc_intel->hdr.rev); 328 cpu_num, mc_intel->hdr.rev);
329 return -1; 329 return -1;
330 } 330 }
331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", 331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
332 cpu_num, val[1], 332 cpu_num, val[1],
333 mc_intel->hdr.date & 0xffff, 333 mc_intel->hdr.date & 0xffff,
334 mc_intel->hdr.date >> 24, 334 mc_intel->hdr.date >> 24,
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15fdc416..71825806cd44 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/dmi.h> 9#include <linux/dmi.h>
10#include <linux/range.h>
11
10#include <asm/pci-direct.h> 12#include <asm/pci-direct.h>
11#include <linux/sort.h> 13#include <linux/sort.h>
12#include <asm/io.h> 14#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
30 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, 32 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
31}; 33};
32 34
33struct range {
34 u64 start;
35 u64 end;
36};
37
38static int __cpuinit cmp_range(const void *x1, const void *x2) 35static int __cpuinit cmp_range(const void *x1, const void *x2)
39{ 36{
40 const struct range *r1 = x1; 37 const struct range *r1 = x1;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d16310..1db183ed7c01 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = {
428 .ptep_modify_prot_start = __ptep_modify_prot_start, 428 .ptep_modify_prot_start = __ptep_modify_prot_start,
429 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 429 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
430 430
431#ifdef CONFIG_HIGHPTE
432 .kmap_atomic_pte = kmap_atomic,
433#endif
434
435#if PAGETABLE_LEVELS >= 3 431#if PAGETABLE_LEVELS >= 3
436#ifdef CONFIG_X86_PAE 432#ifdef CONFIG_X86_PAE
437 .set_pte_atomic = native_set_pte_atomic, 433 .set_pte_atomic = native_set_pte_atomic,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 75e14e21f61a..1aa966c565f9 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -65,7 +65,7 @@ int dma_set_mask(struct device *dev, u64 mask)
65} 65}
66EXPORT_SYMBOL(dma_set_mask); 66EXPORT_SYMBOL(dma_set_mask);
67 67
68#ifdef CONFIG_X86_64 68#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
69static __initdata void *dma32_bootmem_ptr; 69static __initdata void *dma32_bootmem_ptr;
70static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); 70static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
71 71
@@ -116,14 +116,21 @@ static void __init dma32_free_bootmem(void)
116 dma32_bootmem_ptr = NULL; 116 dma32_bootmem_ptr = NULL;
117 dma32_bootmem_size = 0; 117 dma32_bootmem_size = 0;
118} 118}
119#else
120void __init dma32_reserve_bootmem(void)
121{
122}
123static void __init dma32_free_bootmem(void)
124{
125}
126
119#endif 127#endif
120 128
121void __init pci_iommu_alloc(void) 129void __init pci_iommu_alloc(void)
122{ 130{
123#ifdef CONFIG_X86_64
124 /* free the range so iommu could get some range less than 4G */ 131 /* free the range so iommu could get some range less than 4G */
125 dma32_free_bootmem(); 132 dma32_free_bootmem();
126#endif 133
127 if (pci_swiotlb_detect()) 134 if (pci_swiotlb_detect())
128 goto out; 135 goto out;
129 136
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c9b3522b6b46..02d678065d7d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -92,6 +92,13 @@ void exit_thread(void)
92 } 92 }
93} 93}
94 94
95void show_regs(struct pt_regs *regs)
96{
97 show_registers(regs);
98 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
99 regs->bp);
100}
101
95void show_regs_common(void) 102void show_regs_common(void)
96{ 103{
97 const char *board, *product; 104 const char *board, *product;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 37ad1e046aae..f6c62667e30c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -174,12 +174,6 @@ void __show_regs(struct pt_regs *regs, int all)
174 d6, d7); 174 d6, d7);
175} 175}
176 176
177void show_regs(struct pt_regs *regs)
178{
179 show_registers(regs);
180 show_trace(NULL, regs, &regs->sp, regs->bp);
181}
182
183void release_thread(struct task_struct *dead_task) 177void release_thread(struct task_struct *dead_task)
184{ 178{
185 BUG_ON(dead_task->mm); 179 BUG_ON(dead_task->mm);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 41a26a82470a..dc9690b4c4cc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -211,12 +211,6 @@ void __show_regs(struct pt_regs *regs, int all)
211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
212} 212}
213 213
214void show_regs(struct pt_regs *regs)
215{
216 show_registers(regs);
217 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
218}
219
220void release_thread(struct task_struct *dead_task) 214void release_thread(struct task_struct *dead_task)
221{ 215{
222 if (dead_task->mm) { 216 if (dead_task->mm) {
@@ -527,6 +521,7 @@ void set_personality_ia32(void)
527 521
528 /* Make sure to be in 32bit mode */ 522 /* Make sure to be in 32bit mode */
529 set_thread_flag(TIF_IA32); 523 set_thread_flag(TIF_IA32);
524 current->personality |= force_personality32;
530 525
531 /* Prepare the first "return" to user space */ 526 /* Prepare the first "return" to user space */
532 current_thread_info()->status |= TS_COMPAT; 527 current_thread_info()->status |= TS_COMPAT;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 017d937639fe..2d96aab82a48 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -48,6 +48,7 @@ enum x86_regset {
48 REGSET_FP, 48 REGSET_FP,
49 REGSET_XFP, 49 REGSET_XFP,
50 REGSET_IOPERM64 = REGSET_XFP, 50 REGSET_IOPERM64 = REGSET_XFP,
51 REGSET_XSTATE,
51 REGSET_TLS, 52 REGSET_TLS,
52 REGSET_IOPERM32, 53 REGSET_IOPERM32,
53}; 54};
@@ -140,30 +141,6 @@ static const int arg_offs_table[] = {
140#endif 141#endif
141}; 142};
142 143
143/**
144 * regs_get_argument_nth() - get Nth argument at function call
145 * @regs: pt_regs which contains registers at function entry.
146 * @n: argument number.
147 *
148 * regs_get_argument_nth() returns @n th argument of a function call.
149 * Since usually the kernel stack will be changed right after function entry,
150 * you must use this at function entry. If the @n th entry is NOT in the
151 * kernel stack or pt_regs, this returns 0.
152 */
153unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
154{
155 if (n < ARRAY_SIZE(arg_offs_table))
156 return *(unsigned long *)((char *)regs + arg_offs_table[n]);
157 else {
158 /*
159 * The typical case: arg n is on the stack.
160 * (Note: stack[0] = return address, so skip it)
161 */
162 n -= ARRAY_SIZE(arg_offs_table);
163 return regs_get_kernel_stack_nth(regs, 1 + n);
164 }
165}
166
167/* 144/*
168 * does not yet catch signals sent when the child dies. 145 * does not yet catch signals sent when the child dies.
169 * in exit.c or in signal.c. 146 * in exit.c or in signal.c.
@@ -702,7 +679,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
702 } else if (n == 6) { 679 } else if (n == 6) {
703 val = thread->debugreg6; 680 val = thread->debugreg6;
704 } else if (n == 7) { 681 } else if (n == 7) {
705 val = ptrace_get_dr7(thread->ptrace_bps); 682 val = thread->ptrace_dr7;
706 } 683 }
707 return val; 684 return val;
708} 685}
@@ -778,8 +755,11 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
778 return rc; 755 return rc;
779 } 756 }
780 /* All that's left is DR7 */ 757 /* All that's left is DR7 */
781 if (n == 7) 758 if (n == 7) {
782 rc = ptrace_write_dr7(tsk, val); 759 rc = ptrace_write_dr7(tsk, val);
760 if (!rc)
761 thread->ptrace_dr7 = val;
762 }
783 763
784ret_path: 764ret_path:
785 return rc; 765 return rc;
@@ -1584,7 +1564,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1584 1564
1585#ifdef CONFIG_X86_64 1565#ifdef CONFIG_X86_64
1586 1566
1587static const struct user_regset x86_64_regsets[] = { 1567static struct user_regset x86_64_regsets[] __read_mostly = {
1588 [REGSET_GENERAL] = { 1568 [REGSET_GENERAL] = {
1589 .core_note_type = NT_PRSTATUS, 1569 .core_note_type = NT_PRSTATUS,
1590 .n = sizeof(struct user_regs_struct) / sizeof(long), 1570 .n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1597,6 +1577,12 @@ static const struct user_regset x86_64_regsets[] = {
1597 .size = sizeof(long), .align = sizeof(long), 1577 .size = sizeof(long), .align = sizeof(long),
1598 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1578 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1599 }, 1579 },
1580 [REGSET_XSTATE] = {
1581 .core_note_type = NT_X86_XSTATE,
1582 .size = sizeof(u64), .align = sizeof(u64),
1583 .active = xstateregs_active, .get = xstateregs_get,
1584 .set = xstateregs_set
1585 },
1600 [REGSET_IOPERM64] = { 1586 [REGSET_IOPERM64] = {
1601 .core_note_type = NT_386_IOPERM, 1587 .core_note_type = NT_386_IOPERM,
1602 .n = IO_BITMAP_LONGS, 1588 .n = IO_BITMAP_LONGS,
@@ -1622,7 +1608,7 @@ static const struct user_regset_view user_x86_64_view = {
1622#endif /* CONFIG_X86_64 */ 1608#endif /* CONFIG_X86_64 */
1623 1609
1624#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 1610#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1625static const struct user_regset x86_32_regsets[] = { 1611static struct user_regset x86_32_regsets[] __read_mostly = {
1626 [REGSET_GENERAL] = { 1612 [REGSET_GENERAL] = {
1627 .core_note_type = NT_PRSTATUS, 1613 .core_note_type = NT_PRSTATUS,
1628 .n = sizeof(struct user_regs_struct32) / sizeof(u32), 1614 .n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1641,6 +1627,12 @@ static const struct user_regset x86_32_regsets[] = {
1641 .size = sizeof(u32), .align = sizeof(u32), 1627 .size = sizeof(u32), .align = sizeof(u32),
1642 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1628 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1643 }, 1629 },
1630 [REGSET_XSTATE] = {
1631 .core_note_type = NT_X86_XSTATE,
1632 .size = sizeof(u64), .align = sizeof(u64),
1633 .active = xstateregs_active, .get = xstateregs_get,
1634 .set = xstateregs_set
1635 },
1644 [REGSET_TLS] = { 1636 [REGSET_TLS] = {
1645 .core_note_type = NT_386_TLS, 1637 .core_note_type = NT_386_TLS,
1646 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, 1638 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
@@ -1663,6 +1655,23 @@ static const struct user_regset_view user_x86_32_view = {
1663}; 1655};
1664#endif 1656#endif
1665 1657
1658/*
1659 * This represents bytes 464..511 in the memory layout exported through
1660 * the REGSET_XSTATE interface.
1661 */
1662u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
1663
1664void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
1665{
1666#ifdef CONFIG_X86_64
1667 x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1668#endif
1669#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1670 x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1671#endif
1672 xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
1673}
1674
1666const struct user_regset_view *task_user_regset_view(struct task_struct *task) 1675const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1667{ 1676{
1668#ifdef CONFIG_IA32_EMULATION 1677#ifdef CONFIG_IA32_EMULATION
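update_regset_xstate_info() resizes the new REGSET_XSTATE view at boot: the regset exports the whole xsave image as u64 words, so .n is simply xstate_size / 8, and xstate_fx_sw_bytes occupies the six software-reserved words at bytes 464..511 of the legacy area. A quick sketch of that arithmetic (the 832-byte size is an assumed example for an FP+SSE+AVX layout):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        unsigned int xstate_size = 832;  /* assumed: FXSAVE 512 + header 64 + YMM 256 */

        printf("REGSET_XSTATE .n = %u u64 words\n",
               xstate_size / (unsigned int)sizeof(uint64_t));
        printf("xstate_fx_sw_bytes = bytes 464..511 = %u words\n",
               (512u - 464u) / (unsigned int)sizeof(uint64_t));
        return 0;
}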
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5d9e40c58628..5d7ba1a449bd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -121,7 +121,9 @@
121unsigned long max_low_pfn_mapped; 121unsigned long max_low_pfn_mapped;
122unsigned long max_pfn_mapped; 122unsigned long max_pfn_mapped;
123 123
124#ifdef CONFIG_DMI
124RESERVE_BRK(dmi_alloc, 65536); 125RESERVE_BRK(dmi_alloc, 65536);
126#endif
125 127
126unsigned int boot_cpu_id __read_mostly; 128unsigned int boot_cpu_id __read_mostly;
127 129
@@ -667,6 +669,23 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
667 {} 669 {}
668}; 670};
669 671
672static void __init trim_bios_range(void)
673{
674 /*
675 * A special case is the first 4KB of memory;
676 * this is a BIOS-owned area, not kernel RAM, but it is generally
677 * not listed as such in the E820 table.
678 */
679 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
680 /*
681 * Special case: some BIOSes report the PC BIOS
682 * area (640KB->1MB) as RAM even though it is not.
683 * Take it out.
684 */
685 e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
686 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
687}
688
670/* 689/*
671 * Determine if we were loaded by an EFI loader. If so, then we have also been 690 * Determine if we were loaded by an EFI loader. If so, then we have also been
672 * passed the efi memmap, systab, etc., so we should use these data structures 691 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -830,7 +849,7 @@ void __init setup_arch(char **cmdline_p)
830 insert_resource(&iomem_resource, &data_resource); 849 insert_resource(&iomem_resource, &data_resource);
831 insert_resource(&iomem_resource, &bss_resource); 850 insert_resource(&iomem_resource, &bss_resource);
832 851
833 852 trim_bios_range();
834#ifdef CONFIG_X86_32 853#ifdef CONFIG_X86_32
835 if (ppro_with_ram_bug()) { 854 if (ppro_with_ram_bug()) {
836 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, 855 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
@@ -950,15 +969,11 @@ void __init setup_arch(char **cmdline_p)
950#endif 969#endif
951 970
952 initmem_init(0, max_pfn, acpi, k8); 971 initmem_init(0, max_pfn, acpi, k8);
972#ifndef CONFIG_NO_BOOTMEM
973 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
974#endif
953 975
954#ifdef CONFIG_X86_64
955 /*
956 * dma32_reserve_bootmem() allocates bootmem which may conflict
957 * with the crashkernel command line, so do that after
958 * reserve_crashkernel()
959 */
960 dma32_reserve_bootmem(); 976 dma32_reserve_bootmem();
961#endif
962 977
963 reserve_ibft_region(); 978 reserve_ibft_region();
964 979
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 35abcb8b00e9..ef6370b00e70 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
137 137
138static void __init pcpu_fc_free(void *ptr, size_t size) 138static void __init pcpu_fc_free(void *ptr, size_t size)
139{ 139{
140#ifdef CONFIG_NO_BOOTMEM
141 u64 start = __pa(ptr);
142 u64 end = start + size;
143 free_early_partial(start, end);
144#else
140 free_bootmem(__pa(ptr), size); 145 free_bootmem(__pa(ptr), size);
146#endif
141} 147}
142 148
143static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 149static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 86f7edcd0438..a02e80c3c54b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -326,6 +326,7 @@ notrace static void __cpuinit start_secondary(void *unused)
326 unlock_vector_lock(); 326 unlock_vector_lock();
327 ipi_call_unlock(); 327 ipi_call_unlock();
328 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 328 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
329 x86_platform.nmi_init();
329 330
330 /* enable local interrupts */ 331 /* enable local interrupts */
331 local_irq_enable(); 332 local_irq_enable();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 33399176512a..1168e4454188 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -534,6 +534,9 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
534 534
535 get_debugreg(dr6, 6); 535 get_debugreg(dr6, 6);
536 536
537 /* Filter out all the reserved bits which are preset to 1 */
538 dr6 &= ~DR6_RESERVED;
539
537 /* Catch kmemcheck conditions first of all! */ 540 /* Catch kmemcheck conditions first of all! */
538 if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) 541 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 542 return;
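Masking off DR6's reserved bits up front means the checks that follow (kmemcheck, the breakpoint handling, the single-step logic) only ever see genuine status bits. A sketch of the effect; the 0xFFFF0FF0 mask is the usual definition of DR6_RESERVED and is assumed here rather than quoted from this patch:

#include <stdio.h>

#define DR6_RESERVED 0xFFFF0FF0UL  /* assumed: bits the CPU reports as 1 */
#define DR_STEP      0x00004000UL  /* BS, the single-step status bit     */

int main(void)
{
        unsigned long dr6 = 0xFFFF4FF1UL;  /* raw %dr6: reserved bits + BS + B0 */

        dr6 &= ~DR6_RESERVED;              /* what do_debug() now does first    */
        printf("dr6 = %#lx (BS=%d, B0=%d)\n",
               dr6, !!(dr6 & DR_STEP), (int)(dr6 & 1UL));
        return 0;
}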
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 597683aa5ba0..208a857c679f 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void)
740} 740}
741#endif 741#endif
742 742
743static void resume_tsc(void) 743static void resume_tsc(struct clocksource *cs)
744{ 744{
745 clocksource_tsc.cycle_last = 0; 745 clocksource_tsc.cycle_last = 0;
746} 746}
@@ -806,7 +806,7 @@ static void __init check_system_tsc_reliable(void)
806 unsigned long res_low, res_high; 806 unsigned long res_low, res_high;
807 807
808 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); 808 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
809 /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ 809 /* Geode_LX - the OLPC CPU has a very reliable TSC */
810 if (res_low & RTSC_SUSP) 810 if (res_low & RTSC_SUSP)
811 tsc_clocksource_reliable = 1; 811 tsc_clocksource_reliable = 1;
812#endif 812#endif
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
index 36afb98675a4..309c70fb7759 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void)
54 if (!sgi_uv_kobj) 54 if (!sgi_uv_kobj)
55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); 55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
56 if (!sgi_uv_kobj) { 56 if (!sgi_uv_kobj) {
57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); 57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
58 return -EINVAL; 58 return -EINVAL;
59 } 59 }
60 60
61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); 61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
62 if (ret) { 62 if (ret) {
63 printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); 63 printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
64 return ret; 64 return ret;
65 } 65 }
66 66
67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); 67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
68 if (ret) { 68 if (ret) {
69 printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); 69 printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
70 return ret; 70 return ret;
71 } 71 }
72 72
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index d430e4c30193..7dd599deca4a 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -33,6 +33,7 @@
33#include <asm/fixmap.h> 33#include <asm/fixmap.h>
34#include <asm/apicdef.h> 34#include <asm/apicdef.h>
35#include <asm/apic.h> 35#include <asm/apic.h>
36#include <asm/pgalloc.h>
36#include <asm/processor.h> 37#include <asm/processor.h>
37#include <asm/timer.h> 38#include <asm/timer.h>
38#include <asm/vmi_time.h> 39#include <asm/vmi_time.h>
@@ -266,30 +267,6 @@ static void vmi_nop(void)
266{ 267{
267} 268}
268 269
269#ifdef CONFIG_HIGHPTE
270static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
271{
272 void *va = kmap_atomic(page, type);
273
274 /*
275 * Internally, the VMI ROM must map virtual addresses to physical
276 * addresses for processing MMU updates. By the time MMU updates
277 * are issued, this information is typically already lost.
278 * Fortunately, the VMI provides a cache of mapping slots for active
279 * page tables.
280 *
281 * We use slot zero for the linear mapping of physical memory, and
282 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
283 *
284 * args: SLOT VA COUNT PFN
285 */
286 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
287 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
288
289 return va;
290}
291#endif
292
293static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) 270static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
294{ 271{
295 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 272 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -640,6 +617,12 @@ static inline int __init activate_vmi(void)
640 u64 reloc; 617 u64 reloc;
641 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; 618 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
642 619
620 /*
621 * Prevent page tables from being allocated in highmem, even if
622 * CONFIG_HIGHPTE is enabled.
623 */
624 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
625
643 if (call_vrom_func(vmi_rom, vmi_init) != 0) { 626 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
644 printk(KERN_ERR "VMI ROM failed to initialize!"); 627 printk(KERN_ERR "VMI ROM failed to initialize!");
645 return 0; 628 return 0;
@@ -778,10 +761,6 @@ static inline int __init activate_vmi(void)
778 761
779 /* Set linear is needed in all cases */ 762 /* Set linear is needed in all cases */
780 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); 763 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
781#ifdef CONFIG_HIGHPTE
782 if (vmi_ops.set_linear_mapping)
783 pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
784#endif
785 764
786 /* 765 /*
787 * These MUST always be patched. Don't support indirect jumps 766 * These MUST always be patched. Don't support indirect jumps
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f92a0da608cb..44879df55696 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -341,7 +341,7 @@ SECTIONS
341 * Per-cpu symbols which need to be offset from __per_cpu_load 341 * Per-cpu symbols which need to be offset from __per_cpu_load
342 * for the boot processor. 342 * for the boot processor.
343 */ 343 */
344#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load 344#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
345INIT_PER_CPU(gdt_page); 345INIT_PER_CPU(gdt_page);
346INIT_PER_CPU(irq_stack_union); 346INIT_PER_CPU(irq_stack_union);
347 347
@@ -352,7 +352,7 @@ INIT_PER_CPU(irq_stack_union);
352 "kernel image bigger than KERNEL_IMAGE_SIZE"); 352 "kernel image bigger than KERNEL_IMAGE_SIZE");
353 353
354#ifdef CONFIG_SMP 354#ifdef CONFIG_SMP
355. = ASSERT((per_cpu__irq_stack_union == 0), 355. = ASSERT((irq_stack_union == 0),
356 "irq_stack_union is not at start of per-cpu area"); 356 "irq_stack_union is not at start of per-cpu area");
357#endif 357#endif
358 358
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 9055e5872ff0..1c0c6ab9c60f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -301,7 +301,8 @@ static int __init vsyscall_init(void)
301 register_sysctl_table(kernel_root_table2); 301 register_sysctl_table(kernel_root_table2);
302#endif 302#endif
303 on_each_cpu(cpu_vsyscall_init, NULL, 1); 303 on_each_cpu(cpu_vsyscall_init, NULL, 1);
304 hotcpu_notifier(cpu_vsyscall_notifier, 0); 304 /* notifier priority > KVM */
305 hotcpu_notifier(cpu_vsyscall_notifier, 30);
305 return 0; 306 return 0;
306} 307}
307 308
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 619f7f88b8cc..693920b22496 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -26,7 +26,8 @@ EXPORT_SYMBOL(__put_user_2);
26EXPORT_SYMBOL(__put_user_4); 26EXPORT_SYMBOL(__put_user_4);
27EXPORT_SYMBOL(__put_user_8); 27EXPORT_SYMBOL(__put_user_8);
28 28
29EXPORT_SYMBOL(copy_user_generic); 29EXPORT_SYMBOL(copy_user_generic_string);
30EXPORT_SYMBOL(copy_user_generic_unrolled);
30EXPORT_SYMBOL(__copy_user_nocache); 31EXPORT_SYMBOL(__copy_user_nocache);
31EXPORT_SYMBOL(_copy_from_user); 32EXPORT_SYMBOL(_copy_from_user);
32EXPORT_SYMBOL(_copy_to_user); 33EXPORT_SYMBOL(_copy_to_user);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 1817cd7a03fa..61a1e8c7e19f 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -84,10 +84,13 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
84 .setup_percpu_clockev = setup_secondary_APIC_clock, 84 .setup_percpu_clockev = setup_secondary_APIC_clock,
85}; 85};
86 86
87static void default_nmi_init(void) { };
88
87struct x86_platform_ops x86_platform = { 89struct x86_platform_ops x86_platform = {
88 .calibrate_tsc = native_calibrate_tsc, 90 .calibrate_tsc = native_calibrate_tsc,
89 .get_wallclock = mach_get_cmos_time, 91 .get_wallclock = mach_get_cmos_time,
90 .set_wallclock = mach_set_rtc_mmss, 92 .set_wallclock = mach_set_rtc_mmss,
91 .iommu_shutdown = iommu_shutdown_noop, 93 .iommu_shutdown = iommu_shutdown_noop,
92 .is_untracked_pat_range = is_ISA_range, 94 .is_untracked_pat_range = is_ISA_range,
95 .nmi_init = default_nmi_init
93}; 96};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c5ee17e8c6d9..782c3a362ec6 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void)
337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
338 xstate_size = ebx; 338 xstate_size = ebx;
339 339
340 update_regset_xstate_info(xstate_size, pcntxt_mask);
340 prepare_fx_sw_frame(); 341 prepare_fx_sw_frame();
341 342
342 setup_xstate_init(); 343 setup_xstate_init();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4cd498332466..970bbd479516 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select USER_RETURN_NOTIFIER 31 select USER_RETURN_NOTIFIER
32 select KVM_MMIO
32 ---help--- 33 ---help---
33 Support hosting fully virtualized guest machines using hardware 34 Support hosting fully virtualized guest machines using hardware
34 virtualization extensions. You will need a fairly recent 35 virtualization extensions. You will need a fairly recent
@@ -65,6 +66,7 @@ config KVM_AMD
65 66
66# OK, it's a little counter-intuitive to do this, but it puts it neatly under 67# OK, it's a little counter-intuitive to do this, but it puts it neatly under
67# the virtualization menu. 68# the virtualization menu.
69source drivers/vhost/Kconfig
68source drivers/lguest/Kconfig 70source drivers/lguest/Kconfig
69source drivers/virtio/Kconfig 71source drivers/virtio/Kconfig
70 72
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7e8faea4651e..4dade6ac0827 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -32,7 +32,7 @@
32#include <linux/module.h> 32#include <linux/module.h>
33#include <asm/kvm_emulate.h> 33#include <asm/kvm_emulate.h>
34 34
35#include "mmu.h" /* for is_long_mode() */ 35#include "x86.h"
36 36
37/* 37/*
38 * Opcode effective-address decode tables. 38 * Opcode effective-address decode tables.
@@ -76,6 +76,8 @@
76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
77#define GroupMask 0xff /* Group number stored in bits 0:7 */ 77#define GroupMask 0xff /* Group number stored in bits 0:7 */
78/* Misc flags */ 78/* Misc flags */
79#define Lock (1<<26) /* lock prefix is allowed for the instruction */
80#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
79#define No64 (1<<28) 81#define No64 (1<<28)
80/* Source 2 operand type */ 82/* Source 2 operand type */
81#define Src2None (0<<29) 83#define Src2None (0<<29)
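The two new decode-table bits let the emulator centralize two checks instead of open-coding them per opcode: a LOCK prefix is only legal on instructions flagged Lock, and a Priv instruction must raise #GP unless the guest runs at CPL 0. A simplified sketch of such table-driven checks (the check_insn() helper and fault codes are illustrative, not the emulator's API):

#include <stdio.h>

#define Lock (1u << 26)  /* lock prefix is allowed for the instruction    */
#define Priv (1u << 27)  /* instruction generates #GP if current CPL != 0 */

enum fault { FAULT_NONE, FAULT_UD, FAULT_GP };

static enum fault check_insn(unsigned int d, int has_lock_prefix, int cpl)
{
        if (has_lock_prefix && !(d & Lock))
                return FAULT_UD;        /* LOCK on a non-lockable opcode    */
        if ((d & Priv) && cpl != 0)
                return FAULT_GP;        /* privileged opcode from user mode */
        return FAULT_NONE;
}

int main(void)
{
        printf("lock add (Lock), CPL3 -> %d\n", check_insn(Lock, 1, 3)); /* 0 */
        printf("lock mov (no Lock), CPL3 -> %d\n", check_insn(0, 1, 3)); /* 1 */
        printf("wrmsr (Priv), CPL3 -> %d\n", check_insn(Priv, 0, 3));    /* 2 */
        return 0;
}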
@@ -88,39 +90,40 @@
88enum { 90enum {
89 Group1_80, Group1_81, Group1_82, Group1_83, 91 Group1_80, Group1_81, Group1_82, Group1_83,
90 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 92 Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
93 Group8, Group9,
91}; 94};
92 95
93static u32 opcode_table[256] = { 96static u32 opcode_table[256] = {
94 /* 0x00 - 0x07 */ 97 /* 0x00 - 0x07 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 98 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 99 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 100 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
98 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 101 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
99 /* 0x08 - 0x0F */ 102 /* 0x08 - 0x0F */
100 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 103 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
101 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 104 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
102 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 105 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
103 ImplicitOps | Stack | No64, 0, 106 ImplicitOps | Stack | No64, 0,
104 /* 0x10 - 0x17 */ 107 /* 0x10 - 0x17 */
105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 108 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 109 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 110 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 111 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
109 /* 0x18 - 0x1F */ 112 /* 0x18 - 0x1F */
110 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 113 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 114 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 115 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 116 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
114 /* 0x20 - 0x27 */ 117 /* 0x20 - 0x27 */
115 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 118 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 119 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
117 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 120 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
118 /* 0x28 - 0x2F */ 121 /* 0x28 - 0x2F */
119 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 122 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
120 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 123 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
121 0, 0, 0, 0, 124 0, 0, 0, 0,
122 /* 0x30 - 0x37 */ 125 /* 0x30 - 0x37 */
123 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 126 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
124 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 127 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
125 0, 0, 0, 0, 128 0, 0, 0, 0,
126 /* 0x38 - 0x3F */ 129 /* 0x38 - 0x3F */
@@ -156,7 +159,7 @@ static u32 opcode_table[256] = {
156 Group | Group1_80, Group | Group1_81, 159 Group | Group1_80, Group | Group1_81,
157 Group | Group1_82, Group | Group1_83, 160 Group | Group1_82, Group | Group1_83,
158 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 161 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
159 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 162 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
160 /* 0x88 - 0x8F */ 163 /* 0x88 - 0x8F */
161 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 164 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
162 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 165 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -210,7 +213,7 @@ static u32 opcode_table[256] = {
210 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 213 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
211 /* 0xF0 - 0xF7 */ 214 /* 0xF0 - 0xF7 */
212 0, 0, 0, 0, 215 0, 0, 0, 0,
213 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, 216 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
214 /* 0xF8 - 0xFF */ 217 /* 0xF8 - 0xFF */
215 ImplicitOps, 0, ImplicitOps, ImplicitOps, 218 ImplicitOps, 0, ImplicitOps, ImplicitOps,
216 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, 219 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
@@ -218,16 +221,20 @@ static u32 opcode_table[256] = {
218 221
219static u32 twobyte_table[256] = { 222static u32 twobyte_table[256] = {
220 /* 0x00 - 0x0F */ 223 /* 0x00 - 0x0F */
221 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 224 0, Group | GroupDual | Group7, 0, 0,
222 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 225 0, ImplicitOps, ImplicitOps | Priv, 0,
226 ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
227 0, ImplicitOps | ModRM, 0, 0,
223 /* 0x10 - 0x1F */ 228 /* 0x10 - 0x1F */
224 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 229 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
225 /* 0x20 - 0x2F */ 230 /* 0x20 - 0x2F */
226 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, 231 ModRM | ImplicitOps | Priv, ModRM | Priv,
232 ModRM | ImplicitOps | Priv, ModRM | Priv,
233 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0,
228 /* 0x30 - 0x3F */ 235 /* 0x30 - 0x3F */
229 ImplicitOps, 0, ImplicitOps, 0, 236 ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
230 ImplicitOps, ImplicitOps, 0, 0, 237 ImplicitOps, ImplicitOps | Priv, 0, 0,
231 0, 0, 0, 0, 0, 0, 0, 0, 238 0, 0, 0, 0, 0, 0, 0, 0,
232 /* 0x40 - 0x47 */ 239 /* 0x40 - 0x47 */
233 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 240 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -257,21 +264,23 @@ static u32 twobyte_table[256] = {
257 DstMem | SrcReg | Src2CL | ModRM, 0, 0, 264 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
258 /* 0xA8 - 0xAF */ 265 /* 0xA8 - 0xAF */
259 ImplicitOps | Stack, ImplicitOps | Stack, 266 ImplicitOps | Stack, ImplicitOps | Stack,
260 0, DstMem | SrcReg | ModRM | BitOp, 267 0, DstMem | SrcReg | ModRM | BitOp | Lock,
261 DstMem | SrcReg | Src2ImmByte | ModRM, 268 DstMem | SrcReg | Src2ImmByte | ModRM,
262 DstMem | SrcReg | Src2CL | ModRM, 269 DstMem | SrcReg | Src2CL | ModRM,
263 ModRM, 0, 270 ModRM, 0,
264 /* 0xB0 - 0xB7 */ 271 /* 0xB0 - 0xB7 */
265 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 272 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
266 DstMem | SrcReg | ModRM | BitOp, 273 0, DstMem | SrcReg | ModRM | BitOp | Lock,
267 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, 274 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
268 DstReg | SrcMem16 | ModRM | Mov, 275 DstReg | SrcMem16 | ModRM | Mov,
269 /* 0xB8 - 0xBF */ 276 /* 0xB8 - 0xBF */
270 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, 277 0, 0,
278 Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
271 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, 279 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
272 DstReg | SrcMem16 | ModRM | Mov, 280 DstReg | SrcMem16 | ModRM | Mov,
273 /* 0xC0 - 0xCF */ 281 /* 0xC0 - 0xCF */
274 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, 282 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
283 0, 0, 0, Group | GroupDual | Group9,
275 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0,
276 /* 0xD0 - 0xDF */ 285 /* 0xD0 - 0xDF */
277 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -283,25 +292,41 @@ static u32 twobyte_table[256] = {
283 292
284static u32 group_table[] = { 293static u32 group_table[] = {
285 [Group1_80*8] = 294 [Group1_80*8] =
286 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 295 ByteOp | DstMem | SrcImm | ModRM | Lock,
287 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 296 ByteOp | DstMem | SrcImm | ModRM | Lock,
288 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 297 ByteOp | DstMem | SrcImm | ModRM | Lock,
289 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 298 ByteOp | DstMem | SrcImm | ModRM | Lock,
299 ByteOp | DstMem | SrcImm | ModRM | Lock,
300 ByteOp | DstMem | SrcImm | ModRM | Lock,
301 ByteOp | DstMem | SrcImm | ModRM | Lock,
302 ByteOp | DstMem | SrcImm | ModRM,
290 [Group1_81*8] = 303 [Group1_81*8] =
291 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 304 DstMem | SrcImm | ModRM | Lock,
292 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 305 DstMem | SrcImm | ModRM | Lock,
293 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 306 DstMem | SrcImm | ModRM | Lock,
294 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 307 DstMem | SrcImm | ModRM | Lock,
308 DstMem | SrcImm | ModRM | Lock,
309 DstMem | SrcImm | ModRM | Lock,
310 DstMem | SrcImm | ModRM | Lock,
311 DstMem | SrcImm | ModRM,
295 [Group1_82*8] = 312 [Group1_82*8] =
296 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 313 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
297 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 314 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
298 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 315 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
299 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 316 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
317 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
318 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
319 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
320 ByteOp | DstMem | SrcImm | ModRM | No64,
300 [Group1_83*8] = 321 [Group1_83*8] =
301 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 322 DstMem | SrcImmByte | ModRM | Lock,
302 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 323 DstMem | SrcImmByte | ModRM | Lock,
303 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 324 DstMem | SrcImmByte | ModRM | Lock,
304 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 325 DstMem | SrcImmByte | ModRM | Lock,
326 DstMem | SrcImmByte | ModRM | Lock,
327 DstMem | SrcImmByte | ModRM | Lock,
328 DstMem | SrcImmByte | ModRM | Lock,
329 DstMem | SrcImmByte | ModRM,
305 [Group1A*8] = 330 [Group1A*8] =
306 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, 331 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
307 [Group3_Byte*8] = 332 [Group3_Byte*8] =
@@ -320,24 +345,39 @@ static u32 group_table[] = {
320 SrcMem | ModRM | Stack, 0, 345 SrcMem | ModRM | Stack, 0,
321 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, 346 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
322 [Group7*8] = 347 [Group7*8] =
323 0, 0, ModRM | SrcMem, ModRM | SrcMem, 348 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
324 SrcNone | ModRM | DstMem | Mov, 0, 349 SrcNone | ModRM | DstMem | Mov, 0,
325 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, 350 SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
351 [Group8*8] =
352 0, 0, 0, 0,
353 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
354 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
355 [Group9*8] =
356 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
326}; 357};
327 358
328static u32 group2_table[] = { 359static u32 group2_table[] = {
329 [Group7*8] = 360 [Group7*8] =
330 SrcNone | ModRM, 0, 0, SrcNone | ModRM, 361 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
331 SrcNone | ModRM | DstMem | Mov, 0, 362 SrcNone | ModRM | DstMem | Mov, 0,
332 SrcMem16 | ModRM | Mov, 0, 363 SrcMem16 | ModRM | Mov, 0,
364 [Group9*8] =
365 0, 0, 0, 0, 0, 0, 0, 0,
333}; 366};
334 367
335/* EFLAGS bit definitions. */ 368/* EFLAGS bit definitions. */
369#define EFLG_ID (1<<21)
370#define EFLG_VIP (1<<20)
371#define EFLG_VIF (1<<19)
372#define EFLG_AC (1<<18)
336#define EFLG_VM (1<<17) 373#define EFLG_VM (1<<17)
337#define EFLG_RF (1<<16) 374#define EFLG_RF (1<<16)
375#define EFLG_IOPL (3<<12)
376#define EFLG_NT (1<<14)
338#define EFLG_OF (1<<11) 377#define EFLG_OF (1<<11)
339#define EFLG_DF (1<<10) 378#define EFLG_DF (1<<10)
340#define EFLG_IF (1<<9) 379#define EFLG_IF (1<<9)
380#define EFLG_TF (1<<8)
341#define EFLG_SF (1<<7) 381#define EFLG_SF (1<<7)
342#define EFLG_ZF (1<<6) 382#define EFLG_ZF (1<<6)
343#define EFLG_AF (1<<4) 383#define EFLG_AF (1<<4)
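
The EFLG_* additions above feed the privilege checks introduced later in this patch (POPF, CLI/STI and the I/O-port tests). As a rough, self-contained illustration rather than kernel code: IOPL lives in bits 12-13 of EFLAGS, and I/O-sensitive work is permitted only while CPL <= IOPL.

#include <stdio.h>

#define EFLG_IOPL  (3u << 12)	/* two-bit I/O privilege level field */
#define IOPL_SHIFT 12

/* Model of the IOPL test: an I/O-sensitive operation is allowed only
 * when the current privilege level is numerically <= IOPL. */
static int io_sensitive_allowed(unsigned long eflags, int cpl)
{
	int iopl = (eflags & EFLG_IOPL) >> IOPL_SHIFT;

	return cpl <= iopl;
}

int main(void)
{
	printf("cpl 3, iopl 3 -> %d\n", io_sensitive_allowed(0x3202, 3));
	printf("cpl 3, iopl 0 -> %d\n", io_sensitive_allowed(0x0202, 3));
	return 0;
}
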
@@ -606,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
606 646
607 if (linear < fc->start || linear >= fc->end) { 647 if (linear < fc->start || linear >= fc->end) {
608 size = min(15UL, PAGE_SIZE - offset_in_page(linear)); 648 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
609 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); 649 rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
610 if (rc) 650 if (rc)
611 return rc; 651 return rc;
612 fc->start = linear; 652 fc->start = linear;
@@ -661,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
661 op_bytes = 3; 701 op_bytes = 3;
662 *address = 0; 702 *address = 0;
663 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 703 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
664 ctxt->vcpu); 704 ctxt->vcpu, NULL);
665 if (rc) 705 if (rc)
666 return rc; 706 return rc;
667 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 707 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
668 ctxt->vcpu); 708 ctxt->vcpu, NULL);
669 return rc; 709 return rc;
670} 710}
671 711
@@ -889,6 +929,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
889 929
890 switch (mode) { 930 switch (mode) {
891 case X86EMUL_MODE_REAL: 931 case X86EMUL_MODE_REAL:
932 case X86EMUL_MODE_VM86:
892 case X86EMUL_MODE_PROT16: 933 case X86EMUL_MODE_PROT16:
893 def_op_bytes = def_ad_bytes = 2; 934 def_op_bytes = def_ad_bytes = 2;
894 break; 935 break;
@@ -975,7 +1016,7 @@ done_prefixes:
975 } 1016 }
976 1017
977 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 1018 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
978 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; 1019 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
979 return -1; 1020 return -1;
980 } 1021 }
981 1022
@@ -1196,13 +1237,56 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1196 rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1237 rc = ops->read_emulated(register_address(c, ss_base(ctxt),
1197 c->regs[VCPU_REGS_RSP]), 1238 c->regs[VCPU_REGS_RSP]),
1198 dest, len, ctxt->vcpu); 1239 dest, len, ctxt->vcpu);
1199 if (rc != 0) 1240 if (rc != X86EMUL_CONTINUE)
1200 return rc; 1241 return rc;
1201 1242
1202 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); 1243 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
1203 return rc; 1244 return rc;
1204} 1245}
1205 1246
1247static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1248 struct x86_emulate_ops *ops,
1249 void *dest, int len)
1250{
1251 int rc;
1252 unsigned long val, change_mask;
1253 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1254 int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
1255
1256 rc = emulate_pop(ctxt, ops, &val, len);
1257 if (rc != X86EMUL_CONTINUE)
1258 return rc;
1259
1260 change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
1261 | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
1262
1263 switch(ctxt->mode) {
1264 case X86EMUL_MODE_PROT64:
1265 case X86EMUL_MODE_PROT32:
1266 case X86EMUL_MODE_PROT16:
1267 if (cpl == 0)
1268 change_mask |= EFLG_IOPL;
1269 if (cpl <= iopl)
1270 change_mask |= EFLG_IF;
1271 break;
1272 case X86EMUL_MODE_VM86:
1273 if (iopl < 3) {
1274 kvm_inject_gp(ctxt->vcpu, 0);
1275 return X86EMUL_PROPAGATE_FAULT;
1276 }
1277 change_mask |= EFLG_IF;
1278 break;
1279 default: /* real mode */
1280 change_mask |= (EFLG_IOPL | EFLG_IF);
1281 break;
1282 }
1283
1284 *(unsigned long *)dest =
1285 (ctxt->eflags & ~change_mask) | (val & change_mask);
1286
1287 return rc;
1288}
1289
1206static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1290static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1207{ 1291{
1208 struct decode_cache *c = &ctxt->decode; 1292 struct decode_cache *c = &ctxt->decode;
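
The new emulate_popf() above restricts which flag bits a POPF may actually change, depending on mode, CPL and IOPL. A minimal userspace sketch of that mask selection, assuming the same mode/CPL/IOPL semantics as the hunk above (the mode constants here are local stand-ins, not the real X86EMUL_MODE_* values):

#include <stdio.h>

#define EFLG_CF   (1ul << 0)
#define EFLG_PF   (1ul << 2)
#define EFLG_AF   (1ul << 4)
#define EFLG_ZF   (1ul << 6)
#define EFLG_SF   (1ul << 7)
#define EFLG_TF   (1ul << 8)
#define EFLG_IF   (1ul << 9)
#define EFLG_DF   (1ul << 10)
#define EFLG_OF   (1ul << 11)
#define EFLG_IOPL (3ul << 12)
#define EFLG_NT   (1ul << 14)
#define EFLG_RF   (1ul << 16)
#define EFLG_AC   (1ul << 18)
#define EFLG_ID   (1ul << 21)

/* Flag bits every mode may let POPF update. */
#define EFLG_ALWAYS (EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | \
		     EFLG_OF | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | \
		     EFLG_AC | EFLG_ID)

enum mode { MODE_REAL, MODE_VM86, MODE_PROT };

/* Returns the set of EFLAGS bits POPF may modify, or 0 when the
 * caller should inject #GP (virtual-8086 mode with IOPL < 3). */
static unsigned long popf_change_mask(enum mode mode, int cpl, int iopl)
{
	unsigned long mask = EFLG_ALWAYS;

	switch (mode) {
	case MODE_PROT:
		if (cpl == 0)
			mask |= EFLG_IOPL;
		if (cpl <= iopl)
			mask |= EFLG_IF;
		break;
	case MODE_VM86:
		if (iopl < 3)
			return 0;
		mask |= EFLG_IF;
		break;
	default:	/* real mode: everything is fair game */
		mask |= EFLG_IOPL | EFLG_IF;
		break;
	}
	return mask;
}

int main(void)
{
	unsigned long old = 0x0202, popped = 0xffffffff;
	unsigned long mask = popf_change_mask(MODE_PROT, 3, 0);

	/* the update performed once the mask is known */
	printf("new eflags: %#lx\n", (old & ~mask) | (popped & mask));
	return 0;
}
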
@@ -1225,7 +1309,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1225 if (rc != 0) 1309 if (rc != 0)
1226 return rc; 1310 return rc;
1227 1311
1228 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); 1312 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
1229 return rc; 1313 return rc;
1230} 1314}
1231 1315
@@ -1370,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1370 int rc; 1454 int rc;
1371 1455
1372 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); 1456 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1373 if (rc != 0) 1457 if (rc != X86EMUL_CONTINUE)
1374 return rc; 1458 return rc;
1375 1459
1376 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1460 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
@@ -1385,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1385 (u32) c->regs[VCPU_REGS_RBX]; 1469 (u32) c->regs[VCPU_REGS_RBX];
1386 1470
1387 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); 1471 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1388 if (rc != 0) 1472 if (rc != X86EMUL_CONTINUE)
1389 return rc; 1473 return rc;
1390 ctxt->eflags |= EFLG_ZF; 1474 ctxt->eflags |= EFLG_ZF;
1391 } 1475 }
@@ -1407,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1407 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1491 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1408 if (rc) 1492 if (rc)
1409 return rc; 1493 return rc;
1410 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); 1494 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
1411 return rc; 1495 return rc;
1412} 1496}
1413 1497
@@ -1451,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1451 &c->dst.val, 1535 &c->dst.val,
1452 c->dst.bytes, 1536 c->dst.bytes,
1453 ctxt->vcpu); 1537 ctxt->vcpu);
1454 if (rc != 0) 1538 if (rc != X86EMUL_CONTINUE)
1455 return rc; 1539 return rc;
1456 break; 1540 break;
1457 case OP_NONE: 1541 case OP_NONE:
@@ -1514,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1514 u64 msr_data; 1598 u64 msr_data;
1515 1599
1516 /* syscall is not available in real mode */ 1600 /* syscall is not available in real mode */
1517 if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL 1601 if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
1518 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) 1602 return X86EMUL_UNHANDLEABLE;
1519 return -1;
1520 1603
1521 setup_syscalls_segments(ctxt, &cs, &ss); 1604 setup_syscalls_segments(ctxt, &cs, &ss);
1522 1605
@@ -1553,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1553 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1636 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1554 } 1637 }
1555 1638
1556 return 0; 1639 return X86EMUL_CONTINUE;
1557} 1640}
1558 1641
1559static int 1642static int
@@ -1563,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1563 struct kvm_segment cs, ss; 1646 struct kvm_segment cs, ss;
1564 u64 msr_data; 1647 u64 msr_data;
1565 1648
1566 /* inject #UD if LOCK prefix is used */ 1649 /* inject #GP if in real mode */
1567 if (c->lock_prefix) 1650 if (ctxt->mode == X86EMUL_MODE_REAL) {
1568 return -1;
1569
1570 /* inject #GP if in real mode or paging is disabled */
1571 if (ctxt->mode == X86EMUL_MODE_REAL ||
1572 !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1573 kvm_inject_gp(ctxt->vcpu, 0); 1651 kvm_inject_gp(ctxt->vcpu, 0);
1574 return -1; 1652 return X86EMUL_UNHANDLEABLE;
1575 } 1653 }
1576 1654
1577 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1655 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1578 * Therefore, we inject an #UD. 1656 * Therefore, we inject an #UD.
1579 */ 1657 */
1580 if (ctxt->mode == X86EMUL_MODE_PROT64) 1658 if (ctxt->mode == X86EMUL_MODE_PROT64)
1581 return -1; 1659 return X86EMUL_UNHANDLEABLE;
1582 1660
1583 setup_syscalls_segments(ctxt, &cs, &ss); 1661 setup_syscalls_segments(ctxt, &cs, &ss);
1584 1662
@@ -1587,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1587 case X86EMUL_MODE_PROT32: 1665 case X86EMUL_MODE_PROT32:
1588 if ((msr_data & 0xfffc) == 0x0) { 1666 if ((msr_data & 0xfffc) == 0x0) {
1589 kvm_inject_gp(ctxt->vcpu, 0); 1667 kvm_inject_gp(ctxt->vcpu, 0);
1590 return -1; 1668 return X86EMUL_PROPAGATE_FAULT;
1591 } 1669 }
1592 break; 1670 break;
1593 case X86EMUL_MODE_PROT64: 1671 case X86EMUL_MODE_PROT64:
1594 if (msr_data == 0x0) { 1672 if (msr_data == 0x0) {
1595 kvm_inject_gp(ctxt->vcpu, 0); 1673 kvm_inject_gp(ctxt->vcpu, 0);
1596 return -1; 1674 return X86EMUL_PROPAGATE_FAULT;
1597 } 1675 }
1598 break; 1676 break;
1599 } 1677 }
@@ -1618,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1618 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 1696 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1619 c->regs[VCPU_REGS_RSP] = msr_data; 1697 c->regs[VCPU_REGS_RSP] = msr_data;
1620 1698
1621 return 0; 1699 return X86EMUL_CONTINUE;
1622} 1700}
1623 1701
1624static int 1702static int
@@ -1629,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1629 u64 msr_data; 1707 u64 msr_data;
1630 int usermode; 1708 int usermode;
1631 1709
1632 /* inject #UD if LOCK prefix is used */ 1710 /* inject #GP if in real mode or Virtual 8086 mode */
1633 if (c->lock_prefix) 1711 if (ctxt->mode == X86EMUL_MODE_REAL ||
1634 return -1; 1712 ctxt->mode == X86EMUL_MODE_VM86) {
1635
1636 /* inject #GP if in real mode or paging is disabled */
1637 if (ctxt->mode == X86EMUL_MODE_REAL
1638 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1639 kvm_inject_gp(ctxt->vcpu, 0);
1640 return -1;
1641 }
1642
1643 /* sysexit must be called from CPL 0 */
1644 if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
1645 kvm_inject_gp(ctxt->vcpu, 0); 1713 kvm_inject_gp(ctxt->vcpu, 0);
1646 return -1; 1714 return X86EMUL_UNHANDLEABLE;
1647 } 1715 }
1648 1716
1649 setup_syscalls_segments(ctxt, &cs, &ss); 1717 setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1661,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1661 cs.selector = (u16)(msr_data + 16); 1729 cs.selector = (u16)(msr_data + 16);
1662 if ((msr_data & 0xfffc) == 0x0) { 1730 if ((msr_data & 0xfffc) == 0x0) {
1663 kvm_inject_gp(ctxt->vcpu, 0); 1731 kvm_inject_gp(ctxt->vcpu, 0);
1664 return -1; 1732 return X86EMUL_PROPAGATE_FAULT;
1665 } 1733 }
1666 ss.selector = (u16)(msr_data + 24); 1734 ss.selector = (u16)(msr_data + 24);
1667 break; 1735 break;
@@ -1669,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1669 cs.selector = (u16)(msr_data + 32); 1737 cs.selector = (u16)(msr_data + 32);
1670 if (msr_data == 0x0) { 1738 if (msr_data == 0x0) {
1671 kvm_inject_gp(ctxt->vcpu, 0); 1739 kvm_inject_gp(ctxt->vcpu, 0);
1672 return -1; 1740 return X86EMUL_PROPAGATE_FAULT;
1673 } 1741 }
1674 ss.selector = cs.selector + 8; 1742 ss.selector = cs.selector + 8;
1675 cs.db = 0; 1743 cs.db = 0;
@@ -1685,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1685 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; 1753 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
1686 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; 1754 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
1687 1755
1688 return 0; 1756 return X86EMUL_CONTINUE;
1757}
1758
1759static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
1760{
1761 int iopl;
1762 if (ctxt->mode == X86EMUL_MODE_REAL)
1763 return false;
1764 if (ctxt->mode == X86EMUL_MODE_VM86)
1765 return true;
1766 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1767 return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
1768}
1769
1770static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1771 struct x86_emulate_ops *ops,
1772 u16 port, u16 len)
1773{
1774 struct kvm_segment tr_seg;
1775 int r;
1776 u16 io_bitmap_ptr;
1777 u8 perm, bit_idx = port & 0x7;
1778 unsigned mask = (1 << len) - 1;
1779
1780 kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
1781 if (tr_seg.unusable)
1782 return false;
1783 if (tr_seg.limit < 103)
1784 return false;
1785 r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
1786 NULL);
1787 if (r != X86EMUL_CONTINUE)
1788 return false;
1789 if (io_bitmap_ptr + port/8 > tr_seg.limit)
1790 return false;
1791 r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
1792 ctxt->vcpu, NULL);
1793 if (r != X86EMUL_CONTINUE)
1794 return false;
1795 if ((perm >> bit_idx) & mask)
1796 return false;
1797 return true;
1798}
1799
1800static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
1801 struct x86_emulate_ops *ops,
1802 u16 port, u16 len)
1803{
1804 if (emulator_bad_iopl(ctxt))
1805 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
1806 return false;
1807 return true;
1689} 1808}
1690 1809
1691int 1810int
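
emulator_io_port_access_allowed() above consults the I/O permission bitmap in the TSS: a 16-bit pointer stored at TSS offset 102 locates the bitmap, and an access is allowed only when every bit covering the port range is clear. A hedged userspace model of that test, with guest memory and the TSS reduced to a flat byte array:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified model: tss[] stands in for guest memory holding the TSS,
 * tss_limit for the segment limit checked by the emulator. */
static int io_port_allowed(const uint8_t *tss, uint32_t tss_limit,
			   uint16_t port, unsigned len)
{
	uint16_t io_bitmap_ptr;
	uint8_t perm;
	unsigned mask = (1u << len) - 1;

	if (tss_limit < 103)
		return 0;
	memcpy(&io_bitmap_ptr, tss + 102, 2);
	if (io_bitmap_ptr + port / 8 > tss_limit)
		return 0;
	perm = tss[io_bitmap_ptr + port / 8];
	/* every bit covering the access must be 0 for it to be allowed */
	return ((perm >> (port & 7)) & mask) == 0;
}

int main(void)
{
	uint8_t tss[256] = { 0 };
	uint16_t ptr = 104;

	memcpy(tss + 102, &ptr, 2);
	tss[104] = 0x02;	/* deny port 1 only */
	printf("port 0: %d\n", io_port_allowed(tss, sizeof(tss) - 1, 0, 1));
	printf("port 1: %d\n", io_port_allowed(tss, sizeof(tss) - 1, 1, 1));
	return 0;
}
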
@@ -1709,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1709 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 1828 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1710 saved_eip = c->eip; 1829 saved_eip = c->eip;
1711 1830
1831 /* LOCK prefix is allowed only with some instructions */
1832 if (c->lock_prefix && !(c->d & Lock)) {
1833 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1834 goto done;
1835 }
1836
1837 /* Privileged instruction can be executed only in CPL=0 */
1838 if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
1839 kvm_inject_gp(ctxt->vcpu, 0);
1840 goto done;
1841 }
1842
1712 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) 1843 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1713 memop = c->modrm_ea; 1844 memop = c->modrm_ea;
1714 1845
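
The two checks added at the top of x86_emulate_insn() gate execution on the decode flags: a LOCK prefix on an entry without Lock raises #UD, and a Priv entry outside CPL 0 raises #GP. A compact standalone model of that gating (the flag values are illustrative; the vector numbers follow the x86 exception assignments):

#include <stdio.h>

#define FLAG_LOCK (1u << 0)	/* table entry allows a LOCK prefix */
#define FLAG_PRIV (1u << 1)	/* table entry requires CPL 0 */

#define UD_VECTOR 6
#define GP_VECTOR 13

/* Returns -1 if emulation may proceed, otherwise the exception vector. */
static int gate_instruction(unsigned flags, int has_lock_prefix, int cpl)
{
	if (has_lock_prefix && !(flags & FLAG_LOCK))
		return UD_VECTOR;
	if ((flags & FLAG_PRIV) && cpl != 0)
		return GP_VECTOR;
	return -1;
}

int main(void)
{
	printf("lock on non-lockable: %d\n", gate_instruction(0, 1, 0));
	printf("priv at cpl 3:        %d\n", gate_instruction(FLAG_PRIV, 0, 3));
	printf("ok:                   %d\n", gate_instruction(FLAG_LOCK, 1, 3));
	return 0;
}
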
@@ -1749,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1749 &c->src.val, 1880 &c->src.val,
1750 c->src.bytes, 1881 c->src.bytes,
1751 ctxt->vcpu); 1882 ctxt->vcpu);
1752 if (rc != 0) 1883 if (rc != X86EMUL_CONTINUE)
1753 goto done; 1884 goto done;
1754 c->src.orig_val = c->src.val; 1885 c->src.orig_val = c->src.val;
1755 } 1886 }
@@ -1768,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1768 c->dst.ptr = (void *)c->dst.ptr + 1899 c->dst.ptr = (void *)c->dst.ptr +
1769 (c->src.val & mask) / 8; 1900 (c->src.val & mask) / 8;
1770 } 1901 }
1771 if (!(c->d & Mov) && 1902 if (!(c->d & Mov)) {
1772 /* optimisation - avoid slow emulated read */ 1903 /* optimisation - avoid slow emulated read */
1773 ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1904 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1774 &c->dst.val, 1905 &c->dst.val,
1775 c->dst.bytes, ctxt->vcpu)) != 0)) 1906 c->dst.bytes,
1776 goto done; 1907 ctxt->vcpu);
1908 if (rc != X86EMUL_CONTINUE)
1909 goto done;
1910 }
1777 } 1911 }
1778 c->dst.orig_val = c->dst.val; 1912 c->dst.orig_val = c->dst.val;
1779 1913
@@ -1876,7 +2010,12 @@ special_insn:
1876 break; 2010 break;
1877 case 0x6c: /* insb */ 2011 case 0x6c: /* insb */
1878 case 0x6d: /* insw/insd */ 2012 case 0x6d: /* insw/insd */
1879 if (kvm_emulate_pio_string(ctxt->vcpu, 2013 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2014 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2015 kvm_inject_gp(ctxt->vcpu, 0);
2016 goto done;
2017 }
2018 if (kvm_emulate_pio_string(ctxt->vcpu,
1880 1, 2019 1,
1881 (c->d & ByteOp) ? 1 : c->op_bytes, 2020 (c->d & ByteOp) ? 1 : c->op_bytes,
1882 c->rep_prefix ? 2021 c->rep_prefix ?
@@ -1892,6 +2031,11 @@ special_insn:
1892 return 0; 2031 return 0;
1893 case 0x6e: /* outsb */ 2032 case 0x6e: /* outsb */
1894 case 0x6f: /* outsw/outsd */ 2033 case 0x6f: /* outsw/outsd */
2034 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2035 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2036 kvm_inject_gp(ctxt->vcpu, 0);
2037 goto done;
2038 }
1895 if (kvm_emulate_pio_string(ctxt->vcpu, 2039 if (kvm_emulate_pio_string(ctxt->vcpu,
1896 0, 2040 0,
1897 (c->d & ByteOp) ? 1 : c->op_bytes, 2041 (c->d & ByteOp) ? 1 : c->op_bytes,
@@ -1978,25 +2122,19 @@ special_insn:
1978 break; 2122 break;
1979 case 0x8e: { /* mov seg, r/m16 */ 2123 case 0x8e: { /* mov seg, r/m16 */
1980 uint16_t sel; 2124 uint16_t sel;
1981 int type_bits;
1982 int err;
1983 2125
1984 sel = c->src.val; 2126 sel = c->src.val;
1985 if (c->modrm_reg == VCPU_SREG_SS)
1986 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
1987 2127
1988 if (c->modrm_reg <= 5) { 2128 if (c->modrm_reg == VCPU_SREG_CS ||
1989 type_bits = (c->modrm_reg == 1) ? 9 : 1; 2129 c->modrm_reg > VCPU_SREG_GS) {
1990 err = kvm_load_segment_descriptor(ctxt->vcpu, sel, 2130 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1991 type_bits, c->modrm_reg); 2131 goto done;
1992 } else {
1993 printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
1994 c->modrm);
1995 goto cannot_emulate;
1996 } 2132 }
1997 2133
1998 if (err < 0) 2134 if (c->modrm_reg == VCPU_SREG_SS)
1999 goto cannot_emulate; 2135 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
2136
2137 rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
2000 2138
2001 c->dst.type = OP_NONE; /* Disable writeback. */ 2139 c->dst.type = OP_NONE; /* Disable writeback. */
2002 break; 2140 break;
@@ -2025,7 +2163,10 @@ special_insn:
2025 c->dst.type = OP_REG; 2163 c->dst.type = OP_REG;
2026 c->dst.ptr = (unsigned long *) &ctxt->eflags; 2164 c->dst.ptr = (unsigned long *) &ctxt->eflags;
2027 c->dst.bytes = c->op_bytes; 2165 c->dst.bytes = c->op_bytes;
2028 goto pop_instruction; 2166 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
2167 if (rc != X86EMUL_CONTINUE)
2168 goto done;
2169 break;
2029 case 0xa0 ... 0xa1: /* mov */ 2170 case 0xa0 ... 0xa1: /* mov */
2030 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 2171 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2031 c->dst.val = c->src.val; 2172 c->dst.val = c->src.val;
@@ -2039,11 +2180,12 @@ special_insn:
2039 c->dst.ptr = (unsigned long *)register_address(c, 2180 c->dst.ptr = (unsigned long *)register_address(c,
2040 es_base(ctxt), 2181 es_base(ctxt),
2041 c->regs[VCPU_REGS_RDI]); 2182 c->regs[VCPU_REGS_RDI]);
2042 if ((rc = ops->read_emulated(register_address(c, 2183 rc = ops->read_emulated(register_address(c,
2043 seg_override_base(ctxt, c), 2184 seg_override_base(ctxt, c),
2044 c->regs[VCPU_REGS_RSI]), 2185 c->regs[VCPU_REGS_RSI]),
2045 &c->dst.val, 2186 &c->dst.val,
2046 c->dst.bytes, ctxt->vcpu)) != 0) 2187 c->dst.bytes, ctxt->vcpu);
2188 if (rc != X86EMUL_CONTINUE)
2047 goto done; 2189 goto done;
2048 register_address_increment(c, &c->regs[VCPU_REGS_RSI], 2190 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2049 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 2191 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2058,10 +2200,11 @@ special_insn:
2058 c->src.ptr = (unsigned long *)register_address(c, 2200 c->src.ptr = (unsigned long *)register_address(c,
2059 seg_override_base(ctxt, c), 2201 seg_override_base(ctxt, c),
2060 c->regs[VCPU_REGS_RSI]); 2202 c->regs[VCPU_REGS_RSI]);
2061 if ((rc = ops->read_emulated((unsigned long)c->src.ptr, 2203 rc = ops->read_emulated((unsigned long)c->src.ptr,
2062 &c->src.val, 2204 &c->src.val,
2063 c->src.bytes, 2205 c->src.bytes,
2064 ctxt->vcpu)) != 0) 2206 ctxt->vcpu);
2207 if (rc != X86EMUL_CONTINUE)
2065 goto done; 2208 goto done;
2066 2209
2067 c->dst.type = OP_NONE; /* Disable writeback. */ 2210 c->dst.type = OP_NONE; /* Disable writeback. */
@@ -2069,10 +2212,11 @@ special_insn:
2069 c->dst.ptr = (unsigned long *)register_address(c, 2212 c->dst.ptr = (unsigned long *)register_address(c,
2070 es_base(ctxt), 2213 es_base(ctxt),
2071 c->regs[VCPU_REGS_RDI]); 2214 c->regs[VCPU_REGS_RDI]);
2072 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 2215 rc = ops->read_emulated((unsigned long)c->dst.ptr,
2073 &c->dst.val, 2216 &c->dst.val,
2074 c->dst.bytes, 2217 c->dst.bytes,
2075 ctxt->vcpu)) != 0) 2218 ctxt->vcpu);
2219 if (rc != X86EMUL_CONTINUE)
2076 goto done; 2220 goto done;
2077 2221
2078 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2222 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
@@ -2102,12 +2246,13 @@ special_insn:
2102 c->dst.type = OP_REG; 2246 c->dst.type = OP_REG;
2103 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2247 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2104 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 2248 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2105 if ((rc = ops->read_emulated(register_address(c, 2249 rc = ops->read_emulated(register_address(c,
2106 seg_override_base(ctxt, c), 2250 seg_override_base(ctxt, c),
2107 c->regs[VCPU_REGS_RSI]), 2251 c->regs[VCPU_REGS_RSI]),
2108 &c->dst.val, 2252 &c->dst.val,
2109 c->dst.bytes, 2253 c->dst.bytes,
2110 ctxt->vcpu)) != 0) 2254 ctxt->vcpu);
2255 if (rc != X86EMUL_CONTINUE)
2111 goto done; 2256 goto done;
2112 register_address_increment(c, &c->regs[VCPU_REGS_RSI], 2257 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2113 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 2258 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2163,11 +2308,9 @@ special_insn:
2163 case 0xe9: /* jmp rel */ 2308 case 0xe9: /* jmp rel */
2164 goto jmp; 2309 goto jmp;
2165 case 0xea: /* jmp far */ 2310 case 0xea: /* jmp far */
2166 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, 2311 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
2167 VCPU_SREG_CS) < 0) { 2312 VCPU_SREG_CS))
2168 DPRINTF("jmp far: Failed to load CS descriptor\n"); 2313 goto done;
2169 goto cannot_emulate;
2170 }
2171 2314
2172 c->eip = c->src.val; 2315 c->eip = c->src.val;
2173 break; 2316 break;
@@ -2185,7 +2328,13 @@ special_insn:
2185 case 0xef: /* out (e/r)ax,dx */ 2328 case 0xef: /* out (e/r)ax,dx */
2186 port = c->regs[VCPU_REGS_RDX]; 2329 port = c->regs[VCPU_REGS_RDX];
2187 io_dir_in = 0; 2330 io_dir_in = 0;
2188 do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, 2331 do_io:
2332 if (!emulator_io_permited(ctxt, ops, port,
2333 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2334 kvm_inject_gp(ctxt->vcpu, 0);
2335 goto done;
2336 }
2337 if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
2189 (c->d & ByteOp) ? 1 : c->op_bytes, 2338 (c->d & ByteOp) ? 1 : c->op_bytes,
2190 port) != 0) { 2339 port) != 0) {
2191 c->eip = saved_eip; 2340 c->eip = saved_eip;
@@ -2210,13 +2359,21 @@ special_insn:
2210 c->dst.type = OP_NONE; /* Disable writeback. */ 2359 c->dst.type = OP_NONE; /* Disable writeback. */
2211 break; 2360 break;
2212 case 0xfa: /* cli */ 2361 case 0xfa: /* cli */
2213 ctxt->eflags &= ~X86_EFLAGS_IF; 2362 if (emulator_bad_iopl(ctxt))
2214 c->dst.type = OP_NONE; /* Disable writeback. */ 2363 kvm_inject_gp(ctxt->vcpu, 0);
2364 else {
2365 ctxt->eflags &= ~X86_EFLAGS_IF;
2366 c->dst.type = OP_NONE; /* Disable writeback. */
2367 }
2215 break; 2368 break;
2216 case 0xfb: /* sti */ 2369 case 0xfb: /* sti */
2217 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); 2370 if (emulator_bad_iopl(ctxt))
2218 ctxt->eflags |= X86_EFLAGS_IF; 2371 kvm_inject_gp(ctxt->vcpu, 0);
2219 c->dst.type = OP_NONE; /* Disable writeback. */ 2372 else {
2373 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
2374 ctxt->eflags |= X86_EFLAGS_IF;
2375 c->dst.type = OP_NONE; /* Disable writeback. */
2376 }
2220 break; 2377 break;
2221 case 0xfc: /* cld */ 2378 case 0xfc: /* cld */
2222 ctxt->eflags &= ~EFLG_DF; 2379 ctxt->eflags &= ~EFLG_DF;
@@ -2319,8 +2476,9 @@ twobyte_insn:
2319 } 2476 }
2320 break; 2477 break;
2321 case 0x05: /* syscall */ 2478 case 0x05: /* syscall */
2322 if (emulate_syscall(ctxt) == -1) 2479 rc = emulate_syscall(ctxt);
2323 goto cannot_emulate; 2480 if (rc != X86EMUL_CONTINUE)
2481 goto done;
2324 else 2482 else
2325 goto writeback; 2483 goto writeback;
2326 break; 2484 break;
@@ -2391,14 +2549,16 @@ twobyte_insn:
2391 c->dst.type = OP_NONE; 2549 c->dst.type = OP_NONE;
2392 break; 2550 break;
2393 case 0x34: /* sysenter */ 2551 case 0x34: /* sysenter */
2394 if (emulate_sysenter(ctxt) == -1) 2552 rc = emulate_sysenter(ctxt);
2395 goto cannot_emulate; 2553 if (rc != X86EMUL_CONTINUE)
2554 goto done;
2396 else 2555 else
2397 goto writeback; 2556 goto writeback;
2398 break; 2557 break;
2399 case 0x35: /* sysexit */ 2558 case 0x35: /* sysexit */
2400 if (emulate_sysexit(ctxt) == -1) 2559 rc = emulate_sysexit(ctxt);
2401 goto cannot_emulate; 2560 if (rc != X86EMUL_CONTINUE)
2561 goto done;
2402 else 2562 else
2403 goto writeback; 2563 goto writeback;
2404 break; 2564 break;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 15578f180e59..294698b6daff 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -242,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
242{ 242{
243 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 243 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
244 irq_ack_notifier); 244 irq_ack_notifier);
245 spin_lock(&ps->inject_lock); 245 raw_spin_lock(&ps->inject_lock);
246 if (atomic_dec_return(&ps->pit_timer.pending) < 0) 246 if (atomic_dec_return(&ps->pit_timer.pending) < 0)
247 atomic_inc(&ps->pit_timer.pending); 247 atomic_inc(&ps->pit_timer.pending);
248 ps->irq_ack = 1; 248 ps->irq_ack = 1;
249 spin_unlock(&ps->inject_lock); 249 raw_spin_unlock(&ps->inject_lock);
250} 250}
251 251
252void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 252void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -605,7 +605,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
605 .write = speaker_ioport_write, 605 .write = speaker_ioport_write,
606}; 606};
607 607
608/* Caller must have writers lock on slots_lock */ 608/* Caller must hold slots_lock */
609struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) 609struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
610{ 610{
611 struct kvm_pit *pit; 611 struct kvm_pit *pit;
@@ -624,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
624 624
625 mutex_init(&pit->pit_state.lock); 625 mutex_init(&pit->pit_state.lock);
626 mutex_lock(&pit->pit_state.lock); 626 mutex_lock(&pit->pit_state.lock);
627 spin_lock_init(&pit->pit_state.inject_lock); 627 raw_spin_lock_init(&pit->pit_state.inject_lock);
628 628
629 kvm->arch.vpit = pit; 629 kvm->arch.vpit = pit;
630 pit->kvm = kvm; 630 pit->kvm = kvm;
@@ -645,13 +645,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
645 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 645 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
646 646
647 kvm_iodevice_init(&pit->dev, &pit_dev_ops); 647 kvm_iodevice_init(&pit->dev, &pit_dev_ops);
648 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); 648 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
649 if (ret < 0) 649 if (ret < 0)
650 goto fail; 650 goto fail;
651 651
652 if (flags & KVM_PIT_SPEAKER_DUMMY) { 652 if (flags & KVM_PIT_SPEAKER_DUMMY) {
653 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); 653 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
654 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, 654 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
655 &pit->speaker_dev); 655 &pit->speaker_dev);
656 if (ret < 0) 656 if (ret < 0)
657 goto fail_unregister; 657 goto fail_unregister;
@@ -660,11 +660,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
660 return pit; 660 return pit;
661 661
662fail_unregister: 662fail_unregister:
663 __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); 663 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
664 664
665fail: 665fail:
666 if (pit->irq_source_id >= 0) 666 kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
667 kvm_free_irq_source_id(kvm, pit->irq_source_id); 667 kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
668 kvm_free_irq_source_id(kvm, pit->irq_source_id);
668 669
669 kfree(pit); 670 kfree(pit);
670 return NULL; 671 return NULL;
@@ -723,12 +724,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
723 /* Try to inject pending interrupts when 724 /* Try to inject pending interrupts when
724 * last one has been acked. 725 * last one has been acked.
725 */ 726 */
726 spin_lock(&ps->inject_lock); 727 raw_spin_lock(&ps->inject_lock);
727 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { 728 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
728 ps->irq_ack = 0; 729 ps->irq_ack = 0;
729 inject = 1; 730 inject = 1;
730 } 731 }
731 spin_unlock(&ps->inject_lock); 732 raw_spin_unlock(&ps->inject_lock);
732 if (inject) 733 if (inject)
733 __inject_pit_timer_intr(kvm); 734 __inject_pit_timer_intr(kvm);
734 } 735 }
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index d4c1c7ffdc09..900d6b0ba7c2 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
27 u32 speaker_data_on; 27 u32 speaker_data_on;
28 struct mutex lock; 28 struct mutex lock;
29 struct kvm_pit *pit; 29 struct kvm_pit *pit;
30 spinlock_t inject_lock; 30 raw_spinlock_t inject_lock;
31 unsigned long irq_ack; 31 unsigned long irq_ack;
32 struct kvm_irq_ack_notifier irq_ack_notifier; 32 struct kvm_irq_ack_notifier irq_ack_notifier;
33}; 33};
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index d057c0cbd245..07771da85de5 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -44,18 +44,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
44 * Other interrupt may be delivered to PIC while lock is dropped but 44 * Other interrupt may be delivered to PIC while lock is dropped but
45 * it should be safe since PIC state is already updated at this stage. 45 * it should be safe since PIC state is already updated at this stage.
46 */ 46 */
47 spin_unlock(&s->pics_state->lock); 47 raw_spin_unlock(&s->pics_state->lock);
48 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 48 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
49 spin_lock(&s->pics_state->lock); 49 raw_spin_lock(&s->pics_state->lock);
50} 50}
51 51
52void kvm_pic_clear_isr_ack(struct kvm *kvm) 52void kvm_pic_clear_isr_ack(struct kvm *kvm)
53{ 53{
54 struct kvm_pic *s = pic_irqchip(kvm); 54 struct kvm_pic *s = pic_irqchip(kvm);
55 spin_lock(&s->lock); 55
56 raw_spin_lock(&s->lock);
56 s->pics[0].isr_ack = 0xff; 57 s->pics[0].isr_ack = 0xff;
57 s->pics[1].isr_ack = 0xff; 58 s->pics[1].isr_ack = 0xff;
58 spin_unlock(&s->lock); 59 raw_spin_unlock(&s->lock);
59} 60}
60 61
61/* 62/*
@@ -156,9 +157,9 @@ static void pic_update_irq(struct kvm_pic *s)
156 157
157void kvm_pic_update_irq(struct kvm_pic *s) 158void kvm_pic_update_irq(struct kvm_pic *s)
158{ 159{
159 spin_lock(&s->lock); 160 raw_spin_lock(&s->lock);
160 pic_update_irq(s); 161 pic_update_irq(s);
161 spin_unlock(&s->lock); 162 raw_spin_unlock(&s->lock);
162} 163}
163 164
164int kvm_pic_set_irq(void *opaque, int irq, int level) 165int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -166,14 +167,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
166 struct kvm_pic *s = opaque; 167 struct kvm_pic *s = opaque;
167 int ret = -1; 168 int ret = -1;
168 169
169 spin_lock(&s->lock); 170 raw_spin_lock(&s->lock);
170 if (irq >= 0 && irq < PIC_NUM_PINS) { 171 if (irq >= 0 && irq < PIC_NUM_PINS) {
171 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 172 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
172 pic_update_irq(s); 173 pic_update_irq(s);
173 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 174 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
174 s->pics[irq >> 3].imr, ret == 0); 175 s->pics[irq >> 3].imr, ret == 0);
175 } 176 }
176 spin_unlock(&s->lock); 177 raw_spin_unlock(&s->lock);
177 178
178 return ret; 179 return ret;
179} 180}
@@ -203,7 +204,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
203 int irq, irq2, intno; 204 int irq, irq2, intno;
204 struct kvm_pic *s = pic_irqchip(kvm); 205 struct kvm_pic *s = pic_irqchip(kvm);
205 206
206 spin_lock(&s->lock); 207 raw_spin_lock(&s->lock);
207 irq = pic_get_irq(&s->pics[0]); 208 irq = pic_get_irq(&s->pics[0]);
208 if (irq >= 0) { 209 if (irq >= 0) {
209 pic_intack(&s->pics[0], irq); 210 pic_intack(&s->pics[0], irq);
@@ -228,7 +229,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
228 intno = s->pics[0].irq_base + irq; 229 intno = s->pics[0].irq_base + irq;
229 } 230 }
230 pic_update_irq(s); 231 pic_update_irq(s);
231 spin_unlock(&s->lock); 232 raw_spin_unlock(&s->lock);
232 233
233 return intno; 234 return intno;
234} 235}
@@ -442,7 +443,7 @@ static int picdev_write(struct kvm_io_device *this,
442 printk(KERN_ERR "PIC: non byte write\n"); 443 printk(KERN_ERR "PIC: non byte write\n");
443 return 0; 444 return 0;
444 } 445 }
445 spin_lock(&s->lock); 446 raw_spin_lock(&s->lock);
446 switch (addr) { 447 switch (addr) {
447 case 0x20: 448 case 0x20:
448 case 0x21: 449 case 0x21:
@@ -455,7 +456,7 @@ static int picdev_write(struct kvm_io_device *this,
455 elcr_ioport_write(&s->pics[addr & 1], addr, data); 456 elcr_ioport_write(&s->pics[addr & 1], addr, data);
456 break; 457 break;
457 } 458 }
458 spin_unlock(&s->lock); 459 raw_spin_unlock(&s->lock);
459 return 0; 460 return 0;
460} 461}
461 462
@@ -472,7 +473,7 @@ static int picdev_read(struct kvm_io_device *this,
472 printk(KERN_ERR "PIC: non byte read\n"); 473 printk(KERN_ERR "PIC: non byte read\n");
473 return 0; 474 return 0;
474 } 475 }
475 spin_lock(&s->lock); 476 raw_spin_lock(&s->lock);
476 switch (addr) { 477 switch (addr) {
477 case 0x20: 478 case 0x20:
478 case 0x21: 479 case 0x21:
@@ -486,7 +487,7 @@ static int picdev_read(struct kvm_io_device *this,
486 break; 487 break;
487 } 488 }
488 *(unsigned char *)val = data; 489 *(unsigned char *)val = data;
489 spin_unlock(&s->lock); 490 raw_spin_unlock(&s->lock);
490 return 0; 491 return 0;
491} 492}
492 493
@@ -520,7 +521,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
520 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 521 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
521 if (!s) 522 if (!s)
522 return NULL; 523 return NULL;
523 spin_lock_init(&s->lock); 524 raw_spin_lock_init(&s->lock);
524 s->kvm = kvm; 525 s->kvm = kvm;
525 s->pics[0].elcr_mask = 0xf8; 526 s->pics[0].elcr_mask = 0xf8;
526 s->pics[1].elcr_mask = 0xde; 527 s->pics[1].elcr_mask = 0xde;
@@ -533,7 +534,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
533 * Initialize PIO device 534 * Initialize PIO device
534 */ 535 */
535 kvm_iodevice_init(&s->dev, &picdev_ops); 536 kvm_iodevice_init(&s->dev, &picdev_ops);
536 ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); 537 mutex_lock(&kvm->slots_lock);
538 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
539 mutex_unlock(&kvm->slots_lock);
537 if (ret < 0) { 540 if (ret < 0) {
538 kfree(s); 541 kfree(s);
539 return NULL; 542 return NULL;
@@ -541,3 +544,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
541 544
542 return s; 545 return s;
543} 546}
547
548void kvm_destroy_pic(struct kvm *kvm)
549{
550 struct kvm_pic *vpic = kvm->arch.vpic;
551
552 if (vpic) {
553 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
554 kvm->arch.vpic = NULL;
555 kfree(vpic);
556 }
557}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index be399e207d57..34b15915754d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -62,7 +62,7 @@ struct kvm_kpic_state {
62}; 62};
63 63
64struct kvm_pic { 64struct kvm_pic {
65 spinlock_t lock; 65 raw_spinlock_t lock;
66 unsigned pending_acks; 66 unsigned pending_acks;
67 struct kvm *kvm; 67 struct kvm *kvm;
68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
@@ -75,6 +75,7 @@ struct kvm_pic {
75}; 75};
76 76
77struct kvm_pic *kvm_create_pic(struct kvm *kvm); 77struct kvm_pic *kvm_create_pic(struct kvm *kvm);
78void kvm_destroy_pic(struct kvm *kvm);
78int kvm_pic_read_irq(struct kvm *kvm); 79int kvm_pic_read_irq(struct kvm *kvm);
79void kvm_pic_update_irq(struct kvm_pic *s); 80void kvm_pic_update_irq(struct kvm_pic *s);
80void kvm_pic_clear_isr_ack(struct kvm *kvm); 81void kvm_pic_clear_isr_ack(struct kvm *kvm);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 7bcc5b6a4403..cff851cf5322 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -1,6 +1,11 @@
1#ifndef ASM_KVM_CACHE_REGS_H 1#ifndef ASM_KVM_CACHE_REGS_H
2#define ASM_KVM_CACHE_REGS_H 2#define ASM_KVM_CACHE_REGS_H
3 3
4#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
5#define KVM_POSSIBLE_CR4_GUEST_BITS \
6 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
7 | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
8
4static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, 9static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
5 enum kvm_reg reg) 10 enum kvm_reg reg)
6{ 11{
@@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
38 return vcpu->arch.pdptrs[index]; 43 return vcpu->arch.pdptrs[index];
39} 44}
40 45
46static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
47{
48 ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
49 if (tmask & vcpu->arch.cr0_guest_owned_bits)
50 kvm_x86_ops->decache_cr0_guest_bits(vcpu);
51 return vcpu->arch.cr0 & mask;
52}
53
54static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
55{
56 return kvm_read_cr0_bits(vcpu, ~0UL);
57}
58
59static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
60{
61 ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
62 if (tmask & vcpu->arch.cr4_guest_owned_bits)
63 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
64 return vcpu->arch.cr4 & mask;
65}
66
67static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
68{
69 return kvm_read_cr4_bits(vcpu, ~0UL);
70}
71
41#endif 72#endif
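
kvm_read_cr0_bits()/kvm_read_cr4_bits() above only call back into the vendor module when the caller asks for bits the guest is allowed to own, i.e. bits whose cached copy may be stale. A standalone sketch of that lazy-refresh pattern, with the decache callback reduced to a stub and only CR0.TS treated as guest-owned:

#include <stdio.h>

#define CR0_TS (1ul << 3)
#define CR0_WP (1ul << 16)

struct vcpu_model {
	unsigned long cr0;			/* cached value */
	unsigned long cr0_guest_owned_bits;	/* bits the guest may change */
	unsigned long hw_cr0;			/* what the "hardware" holds */
};

/* Stand-in for kvm_x86_ops->decache_cr0_guest_bits(). */
static void decache_cr0_guest_bits(struct vcpu_model *v)
{
	v->cr0 &= ~v->cr0_guest_owned_bits;
	v->cr0 |= v->hw_cr0 & v->cr0_guest_owned_bits;
}

static unsigned long read_cr0_bits(struct vcpu_model *v, unsigned long mask)
{
	/* refresh the cache only if a possibly-stale bit is requested */
	if (mask & v->cr0_guest_owned_bits)
		decache_cr0_guest_bits(v);
	return v->cr0 & mask;
}

int main(void)
{
	struct vcpu_model v = {
		.cr0 = CR0_WP,			/* TS clear in the cache */
		.cr0_guest_owned_bits = CR0_TS,
		.hw_cr0 = CR0_WP | CR0_TS,	/* guest set TS meanwhile */
	};

	printf("WP: %lx (no decache needed)\n", read_cr0_bits(&v, CR0_WP));
	printf("TS: %lx (decached first)\n", read_cr0_bits(&v, CR0_TS));
	return 0;
}
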
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ba8c045da782..4b224f90087b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1246,3 +1246,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
1246 1246
1247 return 0; 1247 return 0;
1248} 1248}
1249
1250int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
1251{
1252 struct kvm_lapic *apic = vcpu->arch.apic;
1253
1254 if (!irqchip_in_kernel(vcpu->kvm))
1255 return 1;
1256
1257 /* if this is ICR write vector before command */
1258 if (reg == APIC_ICR)
1259 apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
1260 return apic_reg_write(apic, reg, (u32)data);
1261}
1262
1263int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1264{
1265 struct kvm_lapic *apic = vcpu->arch.apic;
1266 u32 low, high = 0;
1267
1268 if (!irqchip_in_kernel(vcpu->kvm))
1269 return 1;
1270
1271 if (apic_reg_read(apic, reg, 4, &low))
1272 return 1;
1273 if (reg == APIC_ICR)
1274 apic_reg_read(apic, APIC_ICR2, 4, &high);
1275
1276 *data = (((u64)high) << 32) | low;
1277
1278 return 0;
1279}
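
kvm_hv_vapic_msr_write()/read() above shuttle a 64-bit MSR value through the two 32-bit ICR registers, writing APIC_ICR2 (the destination half) before APIC_ICR so the command fires with its destination already in place. A small model of that split and recombination, with plain integers standing in for the APIC registers:

#include <stdint.h>
#include <stdio.h>

struct apic_model {
	uint32_t icr;	/* low word: vector, delivery mode, ... */
	uint32_t icr2;	/* high word: destination */
};

static void msr_write_icr(struct apic_model *a, uint64_t data)
{
	a->icr2 = (uint32_t)(data >> 32);	/* destination first */
	a->icr  = (uint32_t)data;		/* then the command */
}

static uint64_t msr_read_icr(const struct apic_model *a)
{
	return ((uint64_t)a->icr2 << 32) | a->icr;
}

int main(void)
{
	struct apic_model a = { 0, 0 };
	uint64_t v = 0x1234567800000031ull;	/* arbitrary example value */

	msr_write_icr(&a, v);
	printf("icr2=%#x icr=%#x back=%#llx\n",
	       (unsigned)a.icr2, (unsigned)a.icr,
	       (unsigned long long)msr_read_icr(&a));
	return 0;
}
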
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 40010b09c4aa..f5fe32c5edad 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
48 48
49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); 49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); 50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
51
52int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
53int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
54
55static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
56{
57 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
58}
51#endif 59#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 89a49fb46a27..741373e8ca77 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include "mmu.h" 20#include "mmu.h"
21#include "x86.h"
21#include "kvm_cache_regs.h" 22#include "kvm_cache_regs.h"
22 23
23#include <linux/kvm_host.h> 24#include <linux/kvm_host.h>
@@ -29,6 +30,7 @@
29#include <linux/swap.h> 30#include <linux/swap.h>
30#include <linux/hugetlb.h> 31#include <linux/hugetlb.h>
31#include <linux/compiler.h> 32#include <linux/compiler.h>
33#include <linux/srcu.h>
32 34
33#include <asm/page.h> 35#include <asm/page.h>
34#include <asm/cmpxchg.h> 36#include <asm/cmpxchg.h>
@@ -136,16 +138,6 @@ module_param(oos_shadow, bool, 0644);
136#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 138#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
137 | PT64_NX_MASK) 139 | PT64_NX_MASK)
138 140
139#define PFERR_PRESENT_MASK (1U << 0)
140#define PFERR_WRITE_MASK (1U << 1)
141#define PFERR_USER_MASK (1U << 2)
142#define PFERR_RSVD_MASK (1U << 3)
143#define PFERR_FETCH_MASK (1U << 4)
144
145#define PT_PDPE_LEVEL 3
146#define PT_DIRECTORY_LEVEL 2
147#define PT_PAGE_TABLE_LEVEL 1
148
149#define RMAP_EXT 4 141#define RMAP_EXT 4
150 142
151#define ACC_EXEC_MASK 1 143#define ACC_EXEC_MASK 1
@@ -153,6 +145,9 @@ module_param(oos_shadow, bool, 0644);
153#define ACC_USER_MASK PT_USER_MASK 145#define ACC_USER_MASK PT_USER_MASK
154#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 146#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
155 147
148#include <trace/events/kvm.h>
149
150#undef TRACE_INCLUDE_FILE
156#define CREATE_TRACE_POINTS 151#define CREATE_TRACE_POINTS
157#include "mmutrace.h" 152#include "mmutrace.h"
158 153
@@ -229,7 +224,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
229 224
230static int is_write_protection(struct kvm_vcpu *vcpu) 225static int is_write_protection(struct kvm_vcpu *vcpu)
231{ 226{
232 return vcpu->arch.cr0 & X86_CR0_WP; 227 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
233} 228}
234 229
235static int is_cpuid_PSE36(void) 230static int is_cpuid_PSE36(void)
@@ -239,7 +234,7 @@ static int is_cpuid_PSE36(void)
239 234
240static int is_nx(struct kvm_vcpu *vcpu) 235static int is_nx(struct kvm_vcpu *vcpu)
241{ 236{
242 return vcpu->arch.shadow_efer & EFER_NX; 237 return vcpu->arch.efer & EFER_NX;
243} 238}
244 239
245static int is_shadow_present_pte(u64 pte) 240static int is_shadow_present_pte(u64 pte)
@@ -253,7 +248,7 @@ static int is_large_pte(u64 pte)
253 return pte & PT_PAGE_SIZE_MASK; 248 return pte & PT_PAGE_SIZE_MASK;
254} 249}
255 250
256static int is_writeble_pte(unsigned long pte) 251static int is_writable_pte(unsigned long pte)
257{ 252{
258 return pte & PT_WRITABLE_MASK; 253 return pte & PT_WRITABLE_MASK;
259} 254}
@@ -470,24 +465,10 @@ static int has_wrprotected_page(struct kvm *kvm,
470 465
471static int host_mapping_level(struct kvm *kvm, gfn_t gfn) 466static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
472{ 467{
473 unsigned long page_size = PAGE_SIZE; 468 unsigned long page_size;
474 struct vm_area_struct *vma;
475 unsigned long addr;
476 int i, ret = 0; 469 int i, ret = 0;
477 470
478 addr = gfn_to_hva(kvm, gfn); 471 page_size = kvm_host_page_size(kvm, gfn);
479 if (kvm_is_error_hva(addr))
480 return PT_PAGE_TABLE_LEVEL;
481
482 down_read(&current->mm->mmap_sem);
483 vma = find_vma(current->mm, addr);
484 if (!vma)
485 goto out;
486
487 page_size = vma_kernel_pagesize(vma);
488
489out:
490 up_read(&current->mm->mmap_sem);
491 472
492 for (i = PT_PAGE_TABLE_LEVEL; 473 for (i = PT_PAGE_TABLE_LEVEL;
493 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { 474 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
@@ -503,8 +484,7 @@ out:
503static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 484static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
504{ 485{
505 struct kvm_memory_slot *slot; 486 struct kvm_memory_slot *slot;
506 int host_level; 487 int host_level, level, max_level;
507 int level = PT_PAGE_TABLE_LEVEL;
508 488
509 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 489 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
510 if (slot && slot->dirty_bitmap) 490 if (slot && slot->dirty_bitmap)
@@ -515,7 +495,10 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
515 if (host_level == PT_PAGE_TABLE_LEVEL) 495 if (host_level == PT_PAGE_TABLE_LEVEL)
516 return host_level; 496 return host_level;
517 497
518 for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) 498 max_level = kvm_x86_ops->get_lpage_level() < host_level ?
499 kvm_x86_ops->get_lpage_level() : host_level;
500
501 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
519 if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) 502 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
520 break; 503 break;
521 504
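
The reworked mapping_level() clamps the candidate page size to the smaller of what the host mapping supports and what the vendor module reports through get_lpage_level(), then steps down while a write-protected page sits in range. A rough standalone model of that clamping, with the write-protection lookup stubbed out:

#include <stdio.h>

#define PT_PAGE_TABLE_LEVEL 1	/* 4 KiB */
#define PT_DIRECTORY_LEVEL  2	/* 2 MiB */
#define PT_PDPE_LEVEL       3	/* 1 GiB */

/* Stub: pretend one gfn has a write-protected page at the directory
 * level, which forces a smaller mapping. */
static int has_wrprotected_page(unsigned long gfn, int level)
{
	return level == PT_DIRECTORY_LEVEL && gfn == 0x1000;
}

static int mapping_level(unsigned long gfn, int host_level, int lpage_level)
{
	int max_level = lpage_level < host_level ? lpage_level : host_level;
	int level;

	if (host_level == PT_PAGE_TABLE_LEVEL)
		return host_level;

	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
		if (has_wrprotected_page(gfn, level))
			break;

	return level - 1;
}

int main(void)
{
	printf("clean gfn:     level %d\n",
	       mapping_level(0x2000, PT_PDPE_LEVEL, PT_DIRECTORY_LEVEL));
	printf("protected gfn: level %d\n",
	       mapping_level(0x1000, PT_PDPE_LEVEL, PT_DIRECTORY_LEVEL));
	return 0;
}
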
@@ -633,7 +616,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
633 pfn = spte_to_pfn(*spte); 616 pfn = spte_to_pfn(*spte);
634 if (*spte & shadow_accessed_mask) 617 if (*spte & shadow_accessed_mask)
635 kvm_set_pfn_accessed(pfn); 618 kvm_set_pfn_accessed(pfn);
636 if (is_writeble_pte(*spte)) 619 if (is_writable_pte(*spte))
637 kvm_set_pfn_dirty(pfn); 620 kvm_set_pfn_dirty(pfn);
638 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); 621 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
639 if (!*rmapp) { 622 if (!*rmapp) {
@@ -662,6 +645,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
662 prev_desc = desc; 645 prev_desc = desc;
663 desc = desc->more; 646 desc = desc->more;
664 } 647 }
648 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
665 BUG(); 649 BUG();
666 } 650 }
667} 651}
@@ -708,7 +692,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
708 BUG_ON(!spte); 692 BUG_ON(!spte);
709 BUG_ON(!(*spte & PT_PRESENT_MASK)); 693 BUG_ON(!(*spte & PT_PRESENT_MASK));
710 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 694 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
711 if (is_writeble_pte(*spte)) { 695 if (is_writable_pte(*spte)) {
712 __set_spte(spte, *spte & ~PT_WRITABLE_MASK); 696 __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
713 write_protected = 1; 697 write_protected = 1;
714 } 698 }
@@ -732,7 +716,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
732 BUG_ON(!(*spte & PT_PRESENT_MASK)); 716 BUG_ON(!(*spte & PT_PRESENT_MASK));
733 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 717 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
734 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 718 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
735 if (is_writeble_pte(*spte)) { 719 if (is_writable_pte(*spte)) {
736 rmap_remove(kvm, spte); 720 rmap_remove(kvm, spte);
737 --kvm->stat.lpages; 721 --kvm->stat.lpages;
738 __set_spte(spte, shadow_trap_nonpresent_pte); 722 __set_spte(spte, shadow_trap_nonpresent_pte);
@@ -787,7 +771,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
787 771
788 new_spte &= ~PT_WRITABLE_MASK; 772 new_spte &= ~PT_WRITABLE_MASK;
789 new_spte &= ~SPTE_HOST_WRITEABLE; 773 new_spte &= ~SPTE_HOST_WRITEABLE;
790 if (is_writeble_pte(*spte)) 774 if (is_writable_pte(*spte))
791 kvm_set_pfn_dirty(spte_to_pfn(*spte)); 775 kvm_set_pfn_dirty(spte_to_pfn(*spte));
792 __set_spte(spte, new_spte); 776 __set_spte(spte, new_spte);
793 spte = rmap_next(kvm, rmapp, spte); 777 spte = rmap_next(kvm, rmapp, spte);
@@ -805,35 +789,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
805 unsigned long data)) 789 unsigned long data))
806{ 790{
807 int i, j; 791 int i, j;
792 int ret;
808 int retval = 0; 793 int retval = 0;
794 struct kvm_memslots *slots;
809 795
810 /* 796 slots = rcu_dereference(kvm->memslots);
811 * If mmap_sem isn't taken, we can look the memslots with only 797
812 * the mmu_lock by skipping over the slots with userspace_addr == 0. 798 for (i = 0; i < slots->nmemslots; i++) {
813 */ 799 struct kvm_memory_slot *memslot = &slots->memslots[i];
814 for (i = 0; i < kvm->nmemslots; i++) {
815 struct kvm_memory_slot *memslot = &kvm->memslots[i];
816 unsigned long start = memslot->userspace_addr; 800 unsigned long start = memslot->userspace_addr;
817 unsigned long end; 801 unsigned long end;
818 802
819 /* mmu_lock protects userspace_addr */
820 if (!start)
821 continue;
822
823 end = start + (memslot->npages << PAGE_SHIFT); 803 end = start + (memslot->npages << PAGE_SHIFT);
824 if (hva >= start && hva < end) { 804 if (hva >= start && hva < end) {
825 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 805 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
826 806
827 retval |= handler(kvm, &memslot->rmap[gfn_offset], 807 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
828 data);
829 808
830 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 809 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
831 int idx = gfn_offset; 810 int idx = gfn_offset;
832 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 811 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
833 retval |= handler(kvm, 812 ret |= handler(kvm,
834 &memslot->lpage_info[j][idx].rmap_pde, 813 &memslot->lpage_info[j][idx].rmap_pde,
835 data); 814 data);
836 } 815 }
816 trace_kvm_age_page(hva, memslot, ret);
817 retval |= ret;
837 } 818 }
838 } 819 }
839 820
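
kvm_handle_hva() now walks the memslot array published via kvm->memslots and turns the incoming host virtual address into a slot-relative gfn offset. A simplified model of just that translation over a flat slot array (the RCU/SRCU protection and the per-level rmap handling are omitted):

#include <stdio.h>

#define PAGE_SHIFT 12

struct memslot_model {
	unsigned long userspace_addr;	/* host VA where the slot is mapped */
	unsigned long npages;
};

/* Returns the gfn offset within the slot covering hva, or -1 if none. */
static long hva_to_gfn_offset(const struct memslot_model *slots, int nslots,
			      unsigned long hva)
{
	int i;

	for (i = 0; i < nslots; i++) {
		unsigned long start = slots[i].userspace_addr;
		unsigned long end = start + (slots[i].npages << PAGE_SHIFT);

		if (hva >= start && hva < end)
			return (hva - start) >> PAGE_SHIFT;
	}
	return -1;
}

int main(void)
{
	struct memslot_model slots[] = {
		{ 0x700000000000ul, 256 },	/* 1 MiB slot */
		{ 0x700000200000ul, 512 },	/* 2 MiB slot */
	};

	printf("%ld\n", hva_to_gfn_offset(slots, 2, 0x700000201000ul));
	return 0;
}
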
@@ -856,9 +837,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
856 u64 *spte; 837 u64 *spte;
857 int young = 0; 838 int young = 0;
858 839
859 /* always return old for EPT */ 840 /*
841 * Emulate the accessed bit for EPT, by checking if this page has
842 * an EPT mapping, and clearing it if it does. On the next access,
843 * a new EPT mapping will be established.
844 * This has some overhead, but not as much as the cost of swapping
845 * out actively used pages or breaking up actively used hugepages.
846 */
860 if (!shadow_accessed_mask) 847 if (!shadow_accessed_mask)
861 return 0; 848 return kvm_unmap_rmapp(kvm, rmapp, data);
862 849
863 spte = rmap_next(kvm, rmapp, NULL); 850 spte = rmap_next(kvm, rmapp, NULL);
864 while (spte) { 851 while (spte) {
@@ -1615,7 +1602,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1615 1602
1616static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) 1603static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1617{ 1604{
1618 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); 1605 int slot = memslot_id(kvm, gfn);
1619 struct kvm_mmu_page *sp = page_header(__pa(pte)); 1606 struct kvm_mmu_page *sp = page_header(__pa(pte));
1620 1607
1621 __set_bit(slot, sp->slot_bitmap); 1608 __set_bit(slot, sp->slot_bitmap);
@@ -1639,7 +1626,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1639{ 1626{
1640 struct page *page; 1627 struct page *page;
1641 1628
1642 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 1629 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
1643 1630
1644 if (gpa == UNMAPPED_GVA) 1631 if (gpa == UNMAPPED_GVA)
1645 return NULL; 1632 return NULL;
@@ -1852,7 +1839,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1852 * is responsibility of mmu_get_page / kvm_sync_page. 1839 * is responsibility of mmu_get_page / kvm_sync_page.
1853 * Same reasoning can be applied to dirty page accounting. 1840 * Same reasoning can be applied to dirty page accounting.
1854 */ 1841 */
1855 if (!can_unsync && is_writeble_pte(*sptep)) 1842 if (!can_unsync && is_writable_pte(*sptep))
1856 goto set_pte; 1843 goto set_pte;
1857 1844
1858 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1845 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1860,7 +1847,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1860 __func__, gfn); 1847 __func__, gfn);
1861 ret = 1; 1848 ret = 1;
1862 pte_access &= ~ACC_WRITE_MASK; 1849 pte_access &= ~ACC_WRITE_MASK;
1863 if (is_writeble_pte(spte)) 1850 if (is_writable_pte(spte))
1864 spte &= ~PT_WRITABLE_MASK; 1851 spte &= ~PT_WRITABLE_MASK;
1865 } 1852 }
1866 } 1853 }
@@ -1881,7 +1868,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1881 bool reset_host_protection) 1868 bool reset_host_protection)
1882{ 1869{
1883 int was_rmapped = 0; 1870 int was_rmapped = 0;
1884 int was_writeble = is_writeble_pte(*sptep); 1871 int was_writable = is_writable_pte(*sptep);
1885 int rmap_count; 1872 int rmap_count;
1886 1873
1887 pgprintk("%s: spte %llx access %x write_fault %d" 1874 pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1932,7 +1919,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1932 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 1919 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1933 rmap_recycle(vcpu, sptep, gfn); 1920 rmap_recycle(vcpu, sptep, gfn);
1934 } else { 1921 } else {
1935 if (was_writeble) 1922 if (was_writable)
1936 kvm_release_pfn_dirty(pfn); 1923 kvm_release_pfn_dirty(pfn);
1937 else 1924 else
1938 kvm_release_pfn_clean(pfn); 1925 kvm_release_pfn_clean(pfn);
@@ -2162,8 +2149,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2162 spin_unlock(&vcpu->kvm->mmu_lock); 2149 spin_unlock(&vcpu->kvm->mmu_lock);
2163} 2150}
2164 2151
2165static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2152static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2153 u32 access, u32 *error)
2166{ 2154{
2155 if (error)
2156 *error = 0;
2167 return vaddr; 2157 return vaddr;
2168} 2158}
2169 2159
@@ -2747,7 +2737,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2747 if (tdp_enabled) 2737 if (tdp_enabled)
2748 return 0; 2738 return 0;
2749 2739
2750 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 2740 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2751 2741
2752 spin_lock(&vcpu->kvm->mmu_lock); 2742 spin_lock(&vcpu->kvm->mmu_lock);
2753 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2743 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -2847,16 +2837,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2847 */ 2837 */
2848 page = alloc_page(GFP_KERNEL | __GFP_DMA32); 2838 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
2849 if (!page) 2839 if (!page)
2850 goto error_1; 2840 return -ENOMEM;
2841
2851 vcpu->arch.mmu.pae_root = page_address(page); 2842 vcpu->arch.mmu.pae_root = page_address(page);
2852 for (i = 0; i < 4; ++i) 2843 for (i = 0; i < 4; ++i)
2853 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2844 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2854 2845
2855 return 0; 2846 return 0;
2856
2857error_1:
2858 free_mmu_pages(vcpu);
2859 return -ENOMEM;
2860} 2847}
2861 2848
2862int kvm_mmu_create(struct kvm_vcpu *vcpu) 2849int kvm_mmu_create(struct kvm_vcpu *vcpu)
@@ -2936,10 +2923,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2936 spin_lock(&kvm_lock); 2923 spin_lock(&kvm_lock);
2937 2924
2938 list_for_each_entry(kvm, &vm_list, vm_list) { 2925 list_for_each_entry(kvm, &vm_list, vm_list) {
2939 int npages; 2926 int npages, idx;
2940 2927
2941 if (!down_read_trylock(&kvm->slots_lock)) 2928 idx = srcu_read_lock(&kvm->srcu);
2942 continue;
2943 spin_lock(&kvm->mmu_lock); 2929 spin_lock(&kvm->mmu_lock);
2944 npages = kvm->arch.n_alloc_mmu_pages - 2930 npages = kvm->arch.n_alloc_mmu_pages -
2945 kvm->arch.n_free_mmu_pages; 2931 kvm->arch.n_free_mmu_pages;
@@ -2952,7 +2938,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
 		nr_to_scan--;
 
 		spin_unlock(&kvm->mmu_lock);
-		up_read(&kvm->slots_lock);
+		srcu_read_unlock(&kvm->srcu, idx);
 	}
 	if (kvm_freed)
 		list_move_tail(&kvm_freed->vm_list, &vm_list);
@@ -3019,9 +3005,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 	int i;
 	unsigned int nr_mmu_pages;
 	unsigned int nr_pages = 0;
+	struct kvm_memslots *slots;
 
-	for (i = 0; i < kvm->nmemslots; i++)
-		nr_pages += kvm->memslots[i].npages;
+	slots = rcu_dereference(kvm->memslots);
+	for (i = 0; i < slots->nmemslots; i++)
+		nr_pages += slots->memslots[i].npages;
 
 	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
 	nr_mmu_pages = max(nr_mmu_pages,
@@ -3246,7 +3234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 		if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
 			audit_mappings_page(vcpu, ent, va, level - 1);
 		else {
-			gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
+			gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
 			gfn_t gfn = gpa >> PAGE_SHIFT;
 			pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
 			hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
@@ -3291,10 +3279,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
 static int count_rmaps(struct kvm_vcpu *vcpu)
 {
 	int nmaps = 0;
-	int i, j, k;
+	int i, j, k, idx;
 
+	idx = srcu_read_lock(&kvm->srcu);
+	slots = rcu_dereference(kvm->memslots);
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-		struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+		struct kvm_memory_slot *m = &slots->memslots[i];
 		struct kvm_rmap_desc *d;
 
 		for (j = 0; j < m->npages; ++j) {
@@ -3317,6 +3307,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
 			}
 		}
 	}
+	srcu_read_unlock(&kvm->srcu, idx);
 	return nmaps;
 }
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 61a1b3884b49..be66759321a5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -2,6 +2,7 @@
 #define __KVM_X86_MMU_H
 
 #include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
 
 #define PT64_PT_BITS 9
 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -37,6 +38,16 @@
 #define PT32_ROOT_LEVEL 2
 #define PT32E_ROOT_LEVEL 3
 
+#define PT_PDPE_LEVEL 3
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
+#define PFERR_FETCH_MASK (1U << 4)
+
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 
 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
@@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 	return kvm_mmu_load(vcpu);
 }
 
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-	return vcpu->arch.shadow_efer & EFER_LMA;
-#else
-	return 0;
-#endif
-}
-
-static inline int is_pae(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.cr4 & X86_CR4_PAE;
-}
-
-static inline int is_pse(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.cr4 & X86_CR4_PSE;
-}
-
-static inline int is_paging(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.cr0 & X86_CR0_PG;
-}
-
 static inline int is_present_gpte(unsigned long pte)
 {
 	return pte & PT_PRESENT_MASK;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ede2131a9225..81eab9a50e6a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -162,7 +162,7 @@ walk:
 		if (rsvd_fault)
 			goto access_error;
 
-		if (write_fault && !is_writeble_pte(pte))
+		if (write_fault && !is_writable_pte(pte))
 			if (user_fault || is_write_protection(vcpu))
 				goto access_error;
 
@@ -490,18 +490,23 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
+			       u32 *error)
 {
 	struct guest_walker walker;
 	gpa_t gpa = UNMAPPED_GVA;
 	int r;
 
-	r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
+	r = FNAME(walk_addr)(&walker, vcpu, vaddr,
+			     !!(access & PFERR_WRITE_MASK),
+			     !!(access & PFERR_USER_MASK),
+			     !!(access & PFERR_FETCH_MASK));
 
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
 		gpa |= vaddr & ~PAGE_MASK;
-	}
+	} else if (error)
+		*error = walker.error_code;
 
 	return gpa;
 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1d9b33843c80..52f78dd03010 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -231,7 +231,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
231 efer &= ~EFER_LME; 231 efer &= ~EFER_LME;
232 232
233 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 233 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
234 vcpu->arch.shadow_efer = efer; 234 vcpu->arch.efer = efer;
235} 235}
236 236
237static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 237static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -540,6 +540,8 @@ static void init_vmcb(struct vcpu_svm *svm)
540 struct vmcb_control_area *control = &svm->vmcb->control; 540 struct vmcb_control_area *control = &svm->vmcb->control;
541 struct vmcb_save_area *save = &svm->vmcb->save; 541 struct vmcb_save_area *save = &svm->vmcb->save;
542 542
543 svm->vcpu.fpu_active = 1;
544
543 control->intercept_cr_read = INTERCEPT_CR0_MASK | 545 control->intercept_cr_read = INTERCEPT_CR0_MASK |
544 INTERCEPT_CR3_MASK | 546 INTERCEPT_CR3_MASK |
545 INTERCEPT_CR4_MASK; 547 INTERCEPT_CR4_MASK;
@@ -552,13 +554,19 @@ static void init_vmcb(struct vcpu_svm *svm)
552 control->intercept_dr_read = INTERCEPT_DR0_MASK | 554 control->intercept_dr_read = INTERCEPT_DR0_MASK |
553 INTERCEPT_DR1_MASK | 555 INTERCEPT_DR1_MASK |
554 INTERCEPT_DR2_MASK | 556 INTERCEPT_DR2_MASK |
555 INTERCEPT_DR3_MASK; 557 INTERCEPT_DR3_MASK |
558 INTERCEPT_DR4_MASK |
559 INTERCEPT_DR5_MASK |
560 INTERCEPT_DR6_MASK |
561 INTERCEPT_DR7_MASK;
556 562
557 control->intercept_dr_write = INTERCEPT_DR0_MASK | 563 control->intercept_dr_write = INTERCEPT_DR0_MASK |
558 INTERCEPT_DR1_MASK | 564 INTERCEPT_DR1_MASK |
559 INTERCEPT_DR2_MASK | 565 INTERCEPT_DR2_MASK |
560 INTERCEPT_DR3_MASK | 566 INTERCEPT_DR3_MASK |
567 INTERCEPT_DR4_MASK |
561 INTERCEPT_DR5_MASK | 568 INTERCEPT_DR5_MASK |
569 INTERCEPT_DR6_MASK |
562 INTERCEPT_DR7_MASK; 570 INTERCEPT_DR7_MASK;
563 571
564 control->intercept_exceptions = (1 << PF_VECTOR) | 572 control->intercept_exceptions = (1 << PF_VECTOR) |
@@ -569,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
569 control->intercept = (1ULL << INTERCEPT_INTR) | 577 control->intercept = (1ULL << INTERCEPT_INTR) |
570 (1ULL << INTERCEPT_NMI) | 578 (1ULL << INTERCEPT_NMI) |
571 (1ULL << INTERCEPT_SMI) | 579 (1ULL << INTERCEPT_SMI) |
580 (1ULL << INTERCEPT_SELECTIVE_CR0) |
572 (1ULL << INTERCEPT_CPUID) | 581 (1ULL << INTERCEPT_CPUID) |
573 (1ULL << INTERCEPT_INVD) | 582 (1ULL << INTERCEPT_INVD) |
574 (1ULL << INTERCEPT_HLT) | 583 (1ULL << INTERCEPT_HLT) |
@@ -641,10 +650,8 @@ static void init_vmcb(struct vcpu_svm *svm)
641 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 650 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
642 (1ULL << INTERCEPT_INVLPG)); 651 (1ULL << INTERCEPT_INVLPG));
643 control->intercept_exceptions &= ~(1 << PF_VECTOR); 652 control->intercept_exceptions &= ~(1 << PF_VECTOR);
644 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| 653 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
645 INTERCEPT_CR3_MASK); 654 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
646 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
647 INTERCEPT_CR3_MASK);
648 save->g_pat = 0x0007040600070406ULL; 655 save->g_pat = 0x0007040600070406ULL;
649 save->cr3 = 0; 656 save->cr3 = 0;
650 save->cr4 = 0; 657 save->cr4 = 0;
@@ -730,7 +737,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
730 init_vmcb(svm); 737 init_vmcb(svm);
731 738
732 fx_init(&svm->vcpu); 739 fx_init(&svm->vcpu);
733 svm->vcpu.fpu_active = 1;
734 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 740 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
735 if (kvm_vcpu_is_bsp(&svm->vcpu)) 741 if (kvm_vcpu_is_bsp(&svm->vcpu))
736 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 742 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
@@ -765,14 +771,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
765 if (unlikely(cpu != vcpu->cpu)) { 771 if (unlikely(cpu != vcpu->cpu)) {
766 u64 delta; 772 u64 delta;
767 773
768 /* 774 if (check_tsc_unstable()) {
769 * Make sure that the guest sees a monotonically 775 /*
770 * increasing TSC. 776 * Make sure that the guest sees a monotonically
771 */ 777 * increasing TSC.
772 delta = vcpu->arch.host_tsc - native_read_tsc(); 778 */
773 svm->vmcb->control.tsc_offset += delta; 779 delta = vcpu->arch.host_tsc - native_read_tsc();
774 if (is_nested(svm)) 780 svm->vmcb->control.tsc_offset += delta;
775 svm->nested.hsave->control.tsc_offset += delta; 781 if (is_nested(svm))
782 svm->nested.hsave->control.tsc_offset += delta;
783 }
776 vcpu->cpu = cpu; 784 vcpu->cpu = cpu;
777 kvm_migrate_timers(vcpu); 785 kvm_migrate_timers(vcpu);
778 svm->asid_generation = 0; 786 svm->asid_generation = 0;
@@ -954,42 +962,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
954 svm->vmcb->save.gdtr.base = dt->base ; 962 svm->vmcb->save.gdtr.base = dt->base ;
955} 963}
956 964
965static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
966{
967}
968
957static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 969static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
958{ 970{
959} 971}
960 972
973static void update_cr0_intercept(struct vcpu_svm *svm)
974{
975 ulong gcr0 = svm->vcpu.arch.cr0;
976 u64 *hcr0 = &svm->vmcb->save.cr0;
977
978 if (!svm->vcpu.fpu_active)
979 *hcr0 |= SVM_CR0_SELECTIVE_MASK;
980 else
981 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
982 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
983
984
985 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
986 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
987 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
988 } else {
989 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
990 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
991 }
992}
993
961static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 994static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
962{ 995{
963 struct vcpu_svm *svm = to_svm(vcpu); 996 struct vcpu_svm *svm = to_svm(vcpu);
964 997
965#ifdef CONFIG_X86_64 998#ifdef CONFIG_X86_64
966 if (vcpu->arch.shadow_efer & EFER_LME) { 999 if (vcpu->arch.efer & EFER_LME) {
967 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1000 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
968 vcpu->arch.shadow_efer |= EFER_LMA; 1001 vcpu->arch.efer |= EFER_LMA;
969 svm->vmcb->save.efer |= EFER_LMA | EFER_LME; 1002 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
970 } 1003 }
971 1004
972 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { 1005 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
973 vcpu->arch.shadow_efer &= ~EFER_LMA; 1006 vcpu->arch.efer &= ~EFER_LMA;
974 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); 1007 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
975 } 1008 }
976 } 1009 }
977#endif 1010#endif
978 if (npt_enabled) 1011 vcpu->arch.cr0 = cr0;
979 goto set;
980 1012
981 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { 1013 if (!npt_enabled)
982 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1014 cr0 |= X86_CR0_PG | X86_CR0_WP;
983 vcpu->fpu_active = 1;
984 }
985 1015
986 vcpu->arch.cr0 = cr0; 1016 if (!vcpu->fpu_active)
987 cr0 |= X86_CR0_PG | X86_CR0_WP;
988 if (!vcpu->fpu_active) {
989 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
990 cr0 |= X86_CR0_TS; 1017 cr0 |= X86_CR0_TS;
991 }
992set:
993 /* 1018 /*
994 * re-enable caching here because the QEMU bios 1019 * re-enable caching here because the QEMU bios
995 * does not do it - this results in some delay at 1020 * does not do it - this results in some delay at
@@ -997,6 +1022,7 @@ set:
997 */ 1022 */
998 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1023 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
999 svm->vmcb->save.cr0 = cr0; 1024 svm->vmcb->save.cr0 = cr0;
1025 update_cr0_intercept(svm);
1000} 1026}
1001 1027
1002static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1028static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1102,76 +1128,70 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1102 svm->vmcb->control.asid = sd->next_asid++; 1128 svm->vmcb->control.asid = sd->next_asid++;
1103} 1129}
1104 1130
1105static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 1131static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
1106{ 1132{
1107 struct vcpu_svm *svm = to_svm(vcpu); 1133 struct vcpu_svm *svm = to_svm(vcpu);
1108 unsigned long val;
1109 1134
1110 switch (dr) { 1135 switch (dr) {
1111 case 0 ... 3: 1136 case 0 ... 3:
1112 val = vcpu->arch.db[dr]; 1137 *dest = vcpu->arch.db[dr];
1113 break; 1138 break;
1139 case 4:
1140 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1141 return EMULATE_FAIL; /* will re-inject UD */
1142 /* fall through */
1114 case 6: 1143 case 6:
1115 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1144 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1116 val = vcpu->arch.dr6; 1145 *dest = vcpu->arch.dr6;
1117 else 1146 else
1118 val = svm->vmcb->save.dr6; 1147 *dest = svm->vmcb->save.dr6;
1119 break; 1148 break;
1149 case 5:
1150 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1151 return EMULATE_FAIL; /* will re-inject UD */
1152 /* fall through */
1120 case 7: 1153 case 7:
1121 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1154 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1122 val = vcpu->arch.dr7; 1155 *dest = vcpu->arch.dr7;
1123 else 1156 else
1124 val = svm->vmcb->save.dr7; 1157 *dest = svm->vmcb->save.dr7;
1125 break; 1158 break;
1126 default:
1127 val = 0;
1128 } 1159 }
1129 1160
1130 return val; 1161 return EMULATE_DONE;
1131} 1162}
1132 1163
1133static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 1164static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
1134 int *exception)
1135{ 1165{
1136 struct vcpu_svm *svm = to_svm(vcpu); 1166 struct vcpu_svm *svm = to_svm(vcpu);
1137 1167
1138 *exception = 0;
1139
1140 switch (dr) { 1168 switch (dr) {
1141 case 0 ... 3: 1169 case 0 ... 3:
1142 vcpu->arch.db[dr] = value; 1170 vcpu->arch.db[dr] = value;
1143 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 1171 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1144 vcpu->arch.eff_db[dr] = value; 1172 vcpu->arch.eff_db[dr] = value;
1145 return; 1173 break;
1146 case 4 ... 5: 1174 case 4:
1147 if (vcpu->arch.cr4 & X86_CR4_DE) 1175 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1148 *exception = UD_VECTOR; 1176 return EMULATE_FAIL; /* will re-inject UD */
1149 return; 1177 /* fall through */
1150 case 6: 1178 case 6:
1151 if (value & 0xffffffff00000000ULL) {
1152 *exception = GP_VECTOR;
1153 return;
1154 }
1155 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; 1179 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1156 return; 1180 break;
1181 case 5:
1182 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1183 return EMULATE_FAIL; /* will re-inject UD */
1184 /* fall through */
1157 case 7: 1185 case 7:
1158 if (value & 0xffffffff00000000ULL) {
1159 *exception = GP_VECTOR;
1160 return;
1161 }
1162 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; 1186 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1163 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 1187 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1164 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1188 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1165 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); 1189 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1166 } 1190 }
1167 return; 1191 break;
1168 default:
1169 /* FIXME: Possible case? */
1170 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1171 __func__, dr);
1172 *exception = UD_VECTOR;
1173 return;
1174 } 1192 }
1193
1194 return EMULATE_DONE;
1175} 1195}
1176 1196
1177static int pf_interception(struct vcpu_svm *svm) 1197static int pf_interception(struct vcpu_svm *svm)
@@ -1239,13 +1259,17 @@ static int ud_interception(struct vcpu_svm *svm)
1239 return 1; 1259 return 1;
1240} 1260}
1241 1261
1242static int nm_interception(struct vcpu_svm *svm) 1262static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1243{ 1263{
1264 struct vcpu_svm *svm = to_svm(vcpu);
1244 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1265 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1245 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
1246 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
1247 svm->vcpu.fpu_active = 1; 1266 svm->vcpu.fpu_active = 1;
1267 update_cr0_intercept(svm);
1268}
1248 1269
1270static int nm_interception(struct vcpu_svm *svm)
1271{
1272 svm_fpu_activate(&svm->vcpu);
1249 return 1; 1273 return 1;
1250} 1274}
1251 1275
@@ -1337,7 +1361,7 @@ static int vmmcall_interception(struct vcpu_svm *svm)
1337 1361
1338static int nested_svm_check_permissions(struct vcpu_svm *svm) 1362static int nested_svm_check_permissions(struct vcpu_svm *svm)
1339{ 1363{
1340 if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) 1364 if (!(svm->vcpu.arch.efer & EFER_SVME)
1341 || !is_paging(&svm->vcpu)) { 1365 || !is_paging(&svm->vcpu)) {
1342 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1366 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1343 return 1; 1367 return 1;
@@ -1740,8 +1764,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1740 hsave->save.ds = vmcb->save.ds; 1764 hsave->save.ds = vmcb->save.ds;
1741 hsave->save.gdtr = vmcb->save.gdtr; 1765 hsave->save.gdtr = vmcb->save.gdtr;
1742 hsave->save.idtr = vmcb->save.idtr; 1766 hsave->save.idtr = vmcb->save.idtr;
1743 hsave->save.efer = svm->vcpu.arch.shadow_efer; 1767 hsave->save.efer = svm->vcpu.arch.efer;
1744 hsave->save.cr0 = svm->vcpu.arch.cr0; 1768 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
1745 hsave->save.cr4 = svm->vcpu.arch.cr4; 1769 hsave->save.cr4 = svm->vcpu.arch.cr4;
1746 hsave->save.rflags = vmcb->save.rflags; 1770 hsave->save.rflags = vmcb->save.rflags;
1747 hsave->save.rip = svm->next_rip; 1771 hsave->save.rip = svm->next_rip;
@@ -2153,9 +2177,10 @@ static int rdmsr_interception(struct vcpu_svm *svm)
2153 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2177 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2154 u64 data; 2178 u64 data;
2155 2179
2156 if (svm_get_msr(&svm->vcpu, ecx, &data)) 2180 if (svm_get_msr(&svm->vcpu, ecx, &data)) {
2181 trace_kvm_msr_read_ex(ecx);
2157 kvm_inject_gp(&svm->vcpu, 0); 2182 kvm_inject_gp(&svm->vcpu, 0);
2158 else { 2183 } else {
2159 trace_kvm_msr_read(ecx, data); 2184 trace_kvm_msr_read(ecx, data);
2160 2185
2161 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; 2186 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
@@ -2247,13 +2272,15 @@ static int wrmsr_interception(struct vcpu_svm *svm)
2247 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2272 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
2248 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 2273 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2249 2274
2250 trace_kvm_msr_write(ecx, data);
2251 2275
2252 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2276 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2253 if (svm_set_msr(&svm->vcpu, ecx, data)) 2277 if (svm_set_msr(&svm->vcpu, ecx, data)) {
2278 trace_kvm_msr_write_ex(ecx, data);
2254 kvm_inject_gp(&svm->vcpu, 0); 2279 kvm_inject_gp(&svm->vcpu, 0);
2255 else 2280 } else {
2281 trace_kvm_msr_write(ecx, data);
2256 skip_emulated_instruction(&svm->vcpu); 2282 skip_emulated_instruction(&svm->vcpu);
2283 }
2257 return 1; 2284 return 1;
2258} 2285}
2259 2286
@@ -2297,7 +2324,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2297 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2324 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2298 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2325 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2299 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2326 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2300 /* for now: */ 2327 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2301 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2328 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
2302 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2329 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2303 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2330 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
@@ -2306,11 +2333,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2306 [SVM_EXIT_READ_DR1] = emulate_on_interception, 2333 [SVM_EXIT_READ_DR1] = emulate_on_interception,
2307 [SVM_EXIT_READ_DR2] = emulate_on_interception, 2334 [SVM_EXIT_READ_DR2] = emulate_on_interception,
2308 [SVM_EXIT_READ_DR3] = emulate_on_interception, 2335 [SVM_EXIT_READ_DR3] = emulate_on_interception,
2336 [SVM_EXIT_READ_DR4] = emulate_on_interception,
2337 [SVM_EXIT_READ_DR5] = emulate_on_interception,
2338 [SVM_EXIT_READ_DR6] = emulate_on_interception,
2339 [SVM_EXIT_READ_DR7] = emulate_on_interception,
2309 [SVM_EXIT_WRITE_DR0] = emulate_on_interception, 2340 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
2310 [SVM_EXIT_WRITE_DR1] = emulate_on_interception, 2341 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
2311 [SVM_EXIT_WRITE_DR2] = emulate_on_interception, 2342 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
2312 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 2343 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
2344 [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
2313 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 2345 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
2346 [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
2314 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 2347 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
2315 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 2348 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2316 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 2349 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
@@ -2383,20 +2416,10 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2383 2416
2384 svm_complete_interrupts(svm); 2417 svm_complete_interrupts(svm);
2385 2418
2386 if (npt_enabled) { 2419 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
2387 int mmu_reload = 0;
2388 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
2389 svm_set_cr0(vcpu, svm->vmcb->save.cr0);
2390 mmu_reload = 1;
2391 }
2392 vcpu->arch.cr0 = svm->vmcb->save.cr0; 2420 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2421 if (npt_enabled)
2393 vcpu->arch.cr3 = svm->vmcb->save.cr3; 2422 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2394 if (mmu_reload) {
2395 kvm_mmu_reset_context(vcpu);
2396 kvm_mmu_load(vcpu);
2397 }
2398 }
2399
2400 2423
2401 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2424 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2402 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2425 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2798,12 +2821,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
2798 2821
2799 svm->vmcb->save.cr3 = root; 2822 svm->vmcb->save.cr3 = root;
2800 force_new_asid(vcpu); 2823 force_new_asid(vcpu);
2801
2802 if (vcpu->fpu_active) {
2803 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
2804 svm->vmcb->save.cr0 |= X86_CR0_TS;
2805 vcpu->fpu_active = 0;
2806 }
2807} 2824}
2808 2825
2809static int is_disabled(void) 2826static int is_disabled(void)
@@ -2852,6 +2869,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2852 return 0; 2869 return 0;
2853} 2870}
2854 2871
2872static void svm_cpuid_update(struct kvm_vcpu *vcpu)
2873{
2874}
2875
2855static const struct trace_print_flags svm_exit_reasons_str[] = { 2876static const struct trace_print_flags svm_exit_reasons_str[] = {
2856 { SVM_EXIT_READ_CR0, "read_cr0" }, 2877 { SVM_EXIT_READ_CR0, "read_cr0" },
2857 { SVM_EXIT_READ_CR3, "read_cr3" }, 2878 { SVM_EXIT_READ_CR3, "read_cr3" },
@@ -2905,9 +2926,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
2905 { -1, NULL } 2926 { -1, NULL }
2906}; 2927};
2907 2928
2908static bool svm_gb_page_enable(void) 2929static int svm_get_lpage_level(void)
2909{ 2930{
2910 return true; 2931 return PT_PDPE_LEVEL;
2932}
2933
2934static bool svm_rdtscp_supported(void)
2935{
2936 return false;
2937}
2938
2939static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
2940{
2941 struct vcpu_svm *svm = to_svm(vcpu);
2942
2943 update_cr0_intercept(svm);
2944 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
2911} 2945}
2912 2946
2913static struct kvm_x86_ops svm_x86_ops = { 2947static struct kvm_x86_ops svm_x86_ops = {
@@ -2936,6 +2970,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2936 .set_segment = svm_set_segment, 2970 .set_segment = svm_set_segment,
2937 .get_cpl = svm_get_cpl, 2971 .get_cpl = svm_get_cpl,
2938 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 2972 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
2973 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
2939 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 2974 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
2940 .set_cr0 = svm_set_cr0, 2975 .set_cr0 = svm_set_cr0,
2941 .set_cr3 = svm_set_cr3, 2976 .set_cr3 = svm_set_cr3,
@@ -2950,6 +2985,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2950 .cache_reg = svm_cache_reg, 2985 .cache_reg = svm_cache_reg,
2951 .get_rflags = svm_get_rflags, 2986 .get_rflags = svm_get_rflags,
2952 .set_rflags = svm_set_rflags, 2987 .set_rflags = svm_set_rflags,
2988 .fpu_activate = svm_fpu_activate,
2989 .fpu_deactivate = svm_fpu_deactivate,
2953 2990
2954 .tlb_flush = svm_flush_tlb, 2991 .tlb_flush = svm_flush_tlb,
2955 2992
@@ -2975,7 +3012,11 @@ static struct kvm_x86_ops svm_x86_ops = {
2975 .get_mt_mask = svm_get_mt_mask, 3012 .get_mt_mask = svm_get_mt_mask,
2976 3013
2977 .exit_reasons_str = svm_exit_reasons_str, 3014 .exit_reasons_str = svm_exit_reasons_str,
2978 .gb_page_enable = svm_gb_page_enable, 3015 .get_lpage_level = svm_get_lpage_level,
3016
3017 .cpuid_update = svm_cpuid_update,
3018
3019 .rdtscp_supported = svm_rdtscp_supported,
2979}; 3020};
2980 3021
2981static int __init svm_init(void) 3022static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 816e0449db0b..6ad30a29f044 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall,
 );
 
 /*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hv_hypercall,
+	TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
+		 __u64 ingpa, __u64 outgpa),
+	TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+
+	TP_STRUCT__entry(
+		__field( __u16, code )
+		__field( bool, fast )
+		__field( __u16, rep_cnt )
+		__field( __u16, rep_idx )
+		__field( __u64, ingpa )
+		__field( __u64, outgpa )
+	),
+
+	TP_fast_assign(
+		__entry->code = code;
+		__entry->fast = fast;
+		__entry->rep_cnt = rep_cnt;
+		__entry->rep_idx = rep_idx;
+		__entry->ingpa = ingpa;
+		__entry->outgpa = outgpa;
+	),
+
+	TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+		  __entry->code, __entry->fast ? "fast" : "slow",
+		  __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
+		  __entry->outgpa)
+);
+
+/*
  * Tracepoint for PIO.
  */
 TRACE_EVENT(kvm_pio,
@@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault,
  * Tracepoint for guest MSR access.
  */
 TRACE_EVENT(kvm_msr,
-	TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data),
-	TP_ARGS(rw, ecx, data),
+	TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
+	TP_ARGS(write, ecx, data, exception),
 
 	TP_STRUCT__entry(
-		__field( unsigned int, rw )
-		__field( unsigned int, ecx )
-		__field( unsigned long, data )
+		__field( unsigned, write )
+		__field( u32, ecx )
+		__field( u64, data )
+		__field( u8, exception )
 	),
 
 	TP_fast_assign(
-		__entry->rw = rw;
+		__entry->write = write;
 		__entry->ecx = ecx;
 		__entry->data = data;
+		__entry->exception = exception;
 	),
 
-	TP_printk("msr_%s %x = 0x%lx",
-		  __entry->rw ? "write" : "read",
-		  __entry->ecx, __entry->data)
+	TP_printk("msr_%s %x = 0x%llx%s",
+		  __entry->write ? "write" : "read",
+		  __entry->ecx, __entry->data,
+		  __entry->exception ? " (#GP)" : "")
 );
 
-#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data)
-#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data)
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
+#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
+#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
 
 /*
  * Tracepoint for guest CR access.
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4918d6fc924..14873b9f8430 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,6 +61,21 @@ module_param_named(unrestricted_guest,
61static int __read_mostly emulate_invalid_guest_state = 0; 61static int __read_mostly emulate_invalid_guest_state = 0;
62module_param(emulate_invalid_guest_state, bool, S_IRUGO); 62module_param(emulate_invalid_guest_state, bool, S_IRUGO);
63 63
64#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
65 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
66#define KVM_GUEST_CR0_MASK \
67 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
68#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
69 (X86_CR0_WP | X86_CR0_NE)
70#define KVM_VM_CR0_ALWAYS_ON \
71 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
72#define KVM_CR4_GUEST_OWNED_BITS \
73 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
74 | X86_CR4_OSXMMEXCPT)
75
76#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
77#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
78
64/* 79/*
65 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 80 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
66 * ple_gap: upper bound on the amount of time between two successive 81 * ple_gap: upper bound on the amount of time between two successive
@@ -136,6 +151,8 @@ struct vcpu_vmx {
136 ktime_t entry_time; 151 ktime_t entry_time;
137 s64 vnmi_blocked_time; 152 s64 vnmi_blocked_time;
138 u32 exit_reason; 153 u32 exit_reason;
154
155 bool rdtscp_enabled;
139}; 156};
140 157
141static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 158static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -210,7 +227,7 @@ static const u32 vmx_msr_index[] = {
210#ifdef CONFIG_X86_64 227#ifdef CONFIG_X86_64
211 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 228 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
212#endif 229#endif
213 MSR_EFER, MSR_K6_STAR, 230 MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
214}; 231};
215#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 232#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
216 233
@@ -301,6 +318,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
301 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); 318 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
302} 319}
303 320
321static inline bool cpu_has_vmx_ept_1g_page(void)
322{
323 return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
324}
325
304static inline int cpu_has_vmx_invept_individual_addr(void) 326static inline int cpu_has_vmx_invept_individual_addr(void)
305{ 327{
306 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 328 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -336,9 +358,7 @@ static inline int cpu_has_vmx_ple(void)
336 358
337static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 359static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
338{ 360{
339 return flexpriority_enabled && 361 return flexpriority_enabled && irqchip_in_kernel(kvm);
340 (cpu_has_vmx_virtualize_apic_accesses()) &&
341 (irqchip_in_kernel(kvm));
342} 362}
343 363
344static inline int cpu_has_vmx_vpid(void) 364static inline int cpu_has_vmx_vpid(void)
@@ -347,6 +367,12 @@ static inline int cpu_has_vmx_vpid(void)
347 SECONDARY_EXEC_ENABLE_VPID; 367 SECONDARY_EXEC_ENABLE_VPID;
348} 368}
349 369
370static inline int cpu_has_vmx_rdtscp(void)
371{
372 return vmcs_config.cpu_based_2nd_exec_ctrl &
373 SECONDARY_EXEC_RDTSCP;
374}
375
350static inline int cpu_has_virtual_nmis(void) 376static inline int cpu_has_virtual_nmis(void)
351{ 377{
352 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 378 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -551,22 +577,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
551{ 577{
552 u32 eb; 578 u32 eb;
553 579
554 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); 580 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
555 if (!vcpu->fpu_active) 581 (1u << NM_VECTOR) | (1u << DB_VECTOR);
556 eb |= 1u << NM_VECTOR; 582 if ((vcpu->guest_debug &
557 /* 583 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
558 * Unconditionally intercept #DB so we can maintain dr6 without 584 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
559 * reading it every exit. 585 eb |= 1u << BP_VECTOR;
560 */
561 eb |= 1u << DB_VECTOR;
562 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
563 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
564 eb |= 1u << BP_VECTOR;
565 }
566 if (to_vmx(vcpu)->rmode.vm86_active) 586 if (to_vmx(vcpu)->rmode.vm86_active)
567 eb = ~0; 587 eb = ~0;
568 if (enable_ept) 588 if (enable_ept)
569 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 589 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
590 if (vcpu->fpu_active)
591 eb &= ~(1u << NM_VECTOR);
570 vmcs_write32(EXCEPTION_BITMAP, eb); 592 vmcs_write32(EXCEPTION_BITMAP, eb);
571} 593}
572 594
@@ -589,7 +611,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
589 u64 guest_efer; 611 u64 guest_efer;
590 u64 ignore_bits; 612 u64 ignore_bits;
591 613
592 guest_efer = vmx->vcpu.arch.shadow_efer; 614 guest_efer = vmx->vcpu.arch.efer;
593 615
594 /* 616 /*
595 * NX is emulated; LMA and LME handled by hardware; SCE meaninless 617 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
@@ -767,22 +789,30 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
767 789
768static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 790static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
769{ 791{
792 ulong cr0;
793
770 if (vcpu->fpu_active) 794 if (vcpu->fpu_active)
771 return; 795 return;
772 vcpu->fpu_active = 1; 796 vcpu->fpu_active = 1;
773 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); 797 cr0 = vmcs_readl(GUEST_CR0);
774 if (vcpu->arch.cr0 & X86_CR0_TS) 798 cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
775 vmcs_set_bits(GUEST_CR0, X86_CR0_TS); 799 cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
800 vmcs_writel(GUEST_CR0, cr0);
776 update_exception_bitmap(vcpu); 801 update_exception_bitmap(vcpu);
802 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
803 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
777} 804}
778 805
806static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
807
779static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 808static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
780{ 809{
781 if (!vcpu->fpu_active) 810 vmx_decache_cr0_guest_bits(vcpu);
782 return; 811 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
783 vcpu->fpu_active = 0;
784 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
785 update_exception_bitmap(vcpu); 812 update_exception_bitmap(vcpu);
813 vcpu->arch.cr0_guest_owned_bits = 0;
814 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
815 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
786} 816}
787 817
788static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 818static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -878,6 +908,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
878 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 908 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
879} 909}
880 910
911static bool vmx_rdtscp_supported(void)
912{
913 return cpu_has_vmx_rdtscp();
914}
915
881/* 916/*
882 * Swap MSR entry in host/guest MSR entry array. 917 * Swap MSR entry in host/guest MSR entry array.
883 */ 918 */
@@ -913,12 +948,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
913 index = __find_msr_index(vmx, MSR_CSTAR); 948 index = __find_msr_index(vmx, MSR_CSTAR);
914 if (index >= 0) 949 if (index >= 0)
915 move_msr_up(vmx, index, save_nmsrs++); 950 move_msr_up(vmx, index, save_nmsrs++);
951 index = __find_msr_index(vmx, MSR_TSC_AUX);
952 if (index >= 0 && vmx->rdtscp_enabled)
953 move_msr_up(vmx, index, save_nmsrs++);
916 /* 954 /*
917 * MSR_K6_STAR is only needed on long mode guests, and only 955 * MSR_K6_STAR is only needed on long mode guests, and only
918 * if efer.sce is enabled. 956 * if efer.sce is enabled.
919 */ 957 */
920 index = __find_msr_index(vmx, MSR_K6_STAR); 958 index = __find_msr_index(vmx, MSR_K6_STAR);
921 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) 959 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
922 move_msr_up(vmx, index, save_nmsrs++); 960 move_msr_up(vmx, index, save_nmsrs++);
923 } 961 }
924#endif 962#endif
@@ -1002,6 +1040,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1002 case MSR_IA32_SYSENTER_ESP: 1040 case MSR_IA32_SYSENTER_ESP:
1003 data = vmcs_readl(GUEST_SYSENTER_ESP); 1041 data = vmcs_readl(GUEST_SYSENTER_ESP);
1004 break; 1042 break;
1043 case MSR_TSC_AUX:
1044 if (!to_vmx(vcpu)->rdtscp_enabled)
1045 return 1;
1046 /* Otherwise falls through */
1005 default: 1047 default:
1006 vmx_load_host_state(to_vmx(vcpu)); 1048 vmx_load_host_state(to_vmx(vcpu));
1007 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1049 msr = find_msr_entry(to_vmx(vcpu), msr_index);
@@ -1065,7 +1107,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1065 vcpu->arch.pat = data; 1107 vcpu->arch.pat = data;
1066 break; 1108 break;
1067 } 1109 }
1068 /* Otherwise falls through to kvm_set_msr_common */ 1110 ret = kvm_set_msr_common(vcpu, msr_index, data);
1111 break;
1112 case MSR_TSC_AUX:
1113 if (!vmx->rdtscp_enabled)
1114 return 1;
1115 /* Check reserved bit, higher 32 bits should be zero */
1116 if ((data >> 32) != 0)
1117 return 1;
1118 /* Otherwise falls through */
1069 default: 1119 default:
1070 msr = find_msr_entry(vmx, msr_index); 1120 msr = find_msr_entry(vmx, msr_index);
1071 if (msr) { 1121 if (msr) {
@@ -1224,6 +1274,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1224 CPU_BASED_USE_IO_BITMAPS | 1274 CPU_BASED_USE_IO_BITMAPS |
1225 CPU_BASED_MOV_DR_EXITING | 1275 CPU_BASED_MOV_DR_EXITING |
1226 CPU_BASED_USE_TSC_OFFSETING | 1276 CPU_BASED_USE_TSC_OFFSETING |
1277 CPU_BASED_MWAIT_EXITING |
1278 CPU_BASED_MONITOR_EXITING |
1227 CPU_BASED_INVLPG_EXITING; 1279 CPU_BASED_INVLPG_EXITING;
1228 opt = CPU_BASED_TPR_SHADOW | 1280 opt = CPU_BASED_TPR_SHADOW |
1229 CPU_BASED_USE_MSR_BITMAPS | 1281 CPU_BASED_USE_MSR_BITMAPS |
@@ -1243,7 +1295,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1243 SECONDARY_EXEC_ENABLE_VPID | 1295 SECONDARY_EXEC_ENABLE_VPID |
1244 SECONDARY_EXEC_ENABLE_EPT | 1296 SECONDARY_EXEC_ENABLE_EPT |
1245 SECONDARY_EXEC_UNRESTRICTED_GUEST | 1297 SECONDARY_EXEC_UNRESTRICTED_GUEST |
1246 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 1298 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
1299 SECONDARY_EXEC_RDTSCP;
1247 if (adjust_vmx_controls(min2, opt2, 1300 if (adjust_vmx_controls(min2, opt2,
1248 MSR_IA32_VMX_PROCBASED_CTLS2, 1301 MSR_IA32_VMX_PROCBASED_CTLS2,
1249 &_cpu_based_2nd_exec_control) < 0) 1302 &_cpu_based_2nd_exec_control) < 0)
@@ -1457,8 +1510,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1457static gva_t rmode_tss_base(struct kvm *kvm) 1510static gva_t rmode_tss_base(struct kvm *kvm)
1458{ 1511{
1459 if (!kvm->arch.tss_addr) { 1512 if (!kvm->arch.tss_addr) {
1460 gfn_t base_gfn = kvm->memslots[0].base_gfn + 1513 struct kvm_memslots *slots;
1461 kvm->memslots[0].npages - 3; 1514 gfn_t base_gfn;
1515
1516 slots = rcu_dereference(kvm->memslots);
1517 base_gfn = kvm->memslots->memslots[0].base_gfn +
1518 kvm->memslots->memslots[0].npages - 3;
1462 return base_gfn << PAGE_SHIFT; 1519 return base_gfn << PAGE_SHIFT;
1463 } 1520 }
1464 return kvm->arch.tss_addr; 1521 return kvm->arch.tss_addr;
@@ -1544,9 +1601,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1544 * of this msr depends on is_long_mode(). 1601 * of this msr depends on is_long_mode().
1545 */ 1602 */
1546 vmx_load_host_state(to_vmx(vcpu)); 1603 vmx_load_host_state(to_vmx(vcpu));
1547 vcpu->arch.shadow_efer = efer; 1604 vcpu->arch.efer = efer;
1548 if (!msr)
1549 return;
1550 if (efer & EFER_LMA) { 1605 if (efer & EFER_LMA) {
1551 vmcs_write32(VM_ENTRY_CONTROLS, 1606 vmcs_write32(VM_ENTRY_CONTROLS,
1552 vmcs_read32(VM_ENTRY_CONTROLS) | 1607 vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1576,13 +1631,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1576 (guest_tr_ar & ~AR_TYPE_MASK) 1631 (guest_tr_ar & ~AR_TYPE_MASK)
1577 | AR_TYPE_BUSY_64_TSS); 1632 | AR_TYPE_BUSY_64_TSS);
1578 } 1633 }
1579 vcpu->arch.shadow_efer |= EFER_LMA; 1634 vcpu->arch.efer |= EFER_LMA;
1580 vmx_set_efer(vcpu, vcpu->arch.shadow_efer); 1635 vmx_set_efer(vcpu, vcpu->arch.efer);
1581} 1636}
1582 1637
1583static void exit_lmode(struct kvm_vcpu *vcpu) 1638static void exit_lmode(struct kvm_vcpu *vcpu)
1584{ 1639{
1585 vcpu->arch.shadow_efer &= ~EFER_LMA; 1640 vcpu->arch.efer &= ~EFER_LMA;
1586 1641
1587 vmcs_write32(VM_ENTRY_CONTROLS, 1642 vmcs_write32(VM_ENTRY_CONTROLS,
1588 vmcs_read32(VM_ENTRY_CONTROLS) 1643 vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1598,10 +1653,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1598 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1653 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1599} 1654}
1600 1655
1656static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1657{
1658 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
1659
1660 vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
1661 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1662}
1663
1601static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1664static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1602{ 1665{
1603 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; 1666 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
1604 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; 1667
1668 vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
1669 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
1605} 1670}
1606 1671
1607static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 1672static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -1646,7 +1711,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1646 (CPU_BASED_CR3_LOAD_EXITING | 1711 (CPU_BASED_CR3_LOAD_EXITING |
1647 CPU_BASED_CR3_STORE_EXITING)); 1712 CPU_BASED_CR3_STORE_EXITING));
1648 vcpu->arch.cr0 = cr0; 1713 vcpu->arch.cr0 = cr0;
1649 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1714 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1650 } else if (!is_paging(vcpu)) { 1715 } else if (!is_paging(vcpu)) {
1651 /* From nonpaging to paging */ 1716 /* From nonpaging to paging */
1652 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1717 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1654,23 +1719,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1654 ~(CPU_BASED_CR3_LOAD_EXITING | 1719 ~(CPU_BASED_CR3_LOAD_EXITING |
1655 CPU_BASED_CR3_STORE_EXITING)); 1720 CPU_BASED_CR3_STORE_EXITING));
1656 vcpu->arch.cr0 = cr0; 1721 vcpu->arch.cr0 = cr0;
1657 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1722 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1658 } 1723 }
1659 1724
1660 if (!(cr0 & X86_CR0_WP)) 1725 if (!(cr0 & X86_CR0_WP))
1661 *hw_cr0 &= ~X86_CR0_WP; 1726 *hw_cr0 &= ~X86_CR0_WP;
1662} 1727}
1663 1728
1664static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
1665 struct kvm_vcpu *vcpu)
1666{
1667 if (!is_paging(vcpu)) {
1668 *hw_cr4 &= ~X86_CR4_PAE;
1669 *hw_cr4 |= X86_CR4_PSE;
1670 } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
1671 *hw_cr4 &= ~X86_CR4_PAE;
1672}
1673
1674static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1729static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1675{ 1730{
1676 struct vcpu_vmx *vmx = to_vmx(vcpu); 1731 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1682,8 +1737,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1682 else 1737 else
1683 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; 1738 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
1684 1739
1685 vmx_fpu_deactivate(vcpu);
1686
1687 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 1740 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
1688 enter_pmode(vcpu); 1741 enter_pmode(vcpu);
1689 1742
@@ -1691,7 +1744,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1691 enter_rmode(vcpu); 1744 enter_rmode(vcpu);
1692 1745
1693#ifdef CONFIG_X86_64 1746#ifdef CONFIG_X86_64
1694 if (vcpu->arch.shadow_efer & EFER_LME) { 1747 if (vcpu->arch.efer & EFER_LME) {
1695 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 1748 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1696 enter_lmode(vcpu); 1749 enter_lmode(vcpu);
1697 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 1750 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1702,12 +1755,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1702 if (enable_ept) 1755 if (enable_ept)
1703 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 1756 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1704 1757
1758 if (!vcpu->fpu_active)
1759 hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
1760
1705 vmcs_writel(CR0_READ_SHADOW, cr0); 1761 vmcs_writel(CR0_READ_SHADOW, cr0);
1706 vmcs_writel(GUEST_CR0, hw_cr0); 1762 vmcs_writel(GUEST_CR0, hw_cr0);
1707 vcpu->arch.cr0 = cr0; 1763 vcpu->arch.cr0 = cr0;
1708
1709 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1710 vmx_fpu_activate(vcpu);
1711} 1764}
1712 1765
1713static u64 construct_eptp(unsigned long root_hpa) 1766static u64 construct_eptp(unsigned long root_hpa)
@@ -1738,8 +1791,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1738 1791
1739 vmx_flush_tlb(vcpu); 1792 vmx_flush_tlb(vcpu);
1740 vmcs_writel(GUEST_CR3, guest_cr3); 1793 vmcs_writel(GUEST_CR3, guest_cr3);
1741 if (vcpu->arch.cr0 & X86_CR0_PE)
1742 vmx_fpu_deactivate(vcpu);
1743} 1794}
1744 1795
1745static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1796static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1748,8 +1799,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1748 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 1799 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1749 1800
1750 vcpu->arch.cr4 = cr4; 1801 vcpu->arch.cr4 = cr4;
1751 if (enable_ept) 1802 if (enable_ept) {
1752 ept_update_paging_mode_cr4(&hw_cr4, vcpu); 1803 if (!is_paging(vcpu)) {
1804 hw_cr4 &= ~X86_CR4_PAE;
1805 hw_cr4 |= X86_CR4_PSE;
1806 } else if (!(cr4 & X86_CR4_PAE)) {
1807 hw_cr4 &= ~X86_CR4_PAE;
1808 }
1809 }
1753 1810
1754 vmcs_writel(CR4_READ_SHADOW, cr4); 1811 vmcs_writel(CR4_READ_SHADOW, cr4);
1755 vmcs_writel(GUEST_CR4, hw_cr4); 1812 vmcs_writel(GUEST_CR4, hw_cr4);
@@ -1787,7 +1844,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
1787 1844
1788static int vmx_get_cpl(struct kvm_vcpu *vcpu) 1845static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1789{ 1846{
1790 if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ 1847 if (!is_protmode(vcpu))
1791 return 0; 1848 return 0;
1792 1849
1793 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ 1850 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
@@ -2042,7 +2099,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
2042static bool guest_state_valid(struct kvm_vcpu *vcpu) 2099static bool guest_state_valid(struct kvm_vcpu *vcpu)
2043{ 2100{
2044 /* real mode guest state checks */ 2101 /* real mode guest state checks */
2045 if (!(vcpu->arch.cr0 & X86_CR0_PE)) { 2102 if (!is_protmode(vcpu)) {
2046 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 2103 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
2047 return false; 2104 return false;
2048 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 2105 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
@@ -2175,7 +2232,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
2175 struct kvm_userspace_memory_region kvm_userspace_mem; 2232 struct kvm_userspace_memory_region kvm_userspace_mem;
2176 int r = 0; 2233 int r = 0;
2177 2234
2178 down_write(&kvm->slots_lock); 2235 mutex_lock(&kvm->slots_lock);
2179 if (kvm->arch.apic_access_page) 2236 if (kvm->arch.apic_access_page)
2180 goto out; 2237 goto out;
2181 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; 2238 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -2188,7 +2245,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
2188 2245
2189 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); 2246 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
2190out: 2247out:
2191 up_write(&kvm->slots_lock); 2248 mutex_unlock(&kvm->slots_lock);
2192 return r; 2249 return r;
2193} 2250}
2194 2251
@@ -2197,7 +2254,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2197 struct kvm_userspace_memory_region kvm_userspace_mem; 2254 struct kvm_userspace_memory_region kvm_userspace_mem;
2198 int r = 0; 2255 int r = 0;
2199 2256
2200 down_write(&kvm->slots_lock); 2257 mutex_lock(&kvm->slots_lock);
2201 if (kvm->arch.ept_identity_pagetable) 2258 if (kvm->arch.ept_identity_pagetable)
2202 goto out; 2259 goto out;
2203 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 2260 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
@@ -2212,7 +2269,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2212 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2269 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
2213 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); 2270 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
2214out: 2271out:
2215 up_write(&kvm->slots_lock); 2272 mutex_unlock(&kvm->slots_lock);
2216 return r; 2273 return r;
2217} 2274}
2218 2275
@@ -2384,14 +2441,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2384 for (i = 0; i < NR_VMX_MSR; ++i) { 2441 for (i = 0; i < NR_VMX_MSR; ++i) {
2385 u32 index = vmx_msr_index[i]; 2442 u32 index = vmx_msr_index[i];
2386 u32 data_low, data_high; 2443 u32 data_low, data_high;
2387 u64 data;
2388 int j = vmx->nmsrs; 2444 int j = vmx->nmsrs;
2389 2445
2390 if (rdmsr_safe(index, &data_low, &data_high) < 0) 2446 if (rdmsr_safe(index, &data_low, &data_high) < 0)
2391 continue; 2447 continue;
2392 if (wrmsr_safe(index, data_low, data_high) < 0) 2448 if (wrmsr_safe(index, data_low, data_high) < 0)
2393 continue; 2449 continue;
2394 data = data_low | ((u64)data_high << 32);
2395 vmx->guest_msrs[j].index = i; 2450 vmx->guest_msrs[j].index = i;
2396 vmx->guest_msrs[j].data = 0; 2451 vmx->guest_msrs[j].data = 0;
2397 vmx->guest_msrs[j].mask = -1ull; 2452 vmx->guest_msrs[j].mask = -1ull;
@@ -2404,7 +2459,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2404 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 2459 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2405 2460
2406 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 2461 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2407 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 2462 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
2463 if (enable_ept)
2464 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2465 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2408 2466
2409 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; 2467 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
2410 rdtscll(tsc_this); 2468 rdtscll(tsc_this);
@@ -2429,10 +2487,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2429{ 2487{
2430 struct vcpu_vmx *vmx = to_vmx(vcpu); 2488 struct vcpu_vmx *vmx = to_vmx(vcpu);
2431 u64 msr; 2489 u64 msr;
2432 int ret; 2490 int ret, idx;
2433 2491
2434 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2492 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2435 down_read(&vcpu->kvm->slots_lock); 2493 idx = srcu_read_lock(&vcpu->kvm->srcu);
2436 if (!init_rmode(vmx->vcpu.kvm)) { 2494 if (!init_rmode(vmx->vcpu.kvm)) {
2437 ret = -ENOMEM; 2495 ret = -ENOMEM;
2438 goto out; 2496 goto out;
@@ -2526,7 +2584,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2526 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2584 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2527 2585
2528 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 2586 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
2529 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ 2587 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
2530 vmx_set_cr4(&vmx->vcpu, 0); 2588 vmx_set_cr4(&vmx->vcpu, 0);
2531 vmx_set_efer(&vmx->vcpu, 0); 2589 vmx_set_efer(&vmx->vcpu, 0);
2532 vmx_fpu_activate(&vmx->vcpu); 2590 vmx_fpu_activate(&vmx->vcpu);
@@ -2540,7 +2598,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2540 vmx->emulation_required = 0; 2598 vmx->emulation_required = 0;
2541 2599
2542out: 2600out:
2543 up_read(&vcpu->kvm->slots_lock); 2601 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2544 return ret; 2602 return ret;
2545} 2603}
2546 2604
@@ -2717,6 +2775,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2717 kvm_queue_exception(vcpu, vec); 2775 kvm_queue_exception(vcpu, vec);
2718 return 1; 2776 return 1;
2719 case BP_VECTOR: 2777 case BP_VECTOR:
2778 /*
2779 * Update instruction length as we may reinject the exception
2780 * from user space while in guest debugging mode.
2781 */
2782 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
2783 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2720 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 2784 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2721 return 0; 2785 return 0;
2722 /* fall through */ 2786 /* fall through */
@@ -2839,6 +2903,13 @@ static int handle_exception(struct kvm_vcpu *vcpu)
2839 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 2903 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
2840 /* fall through */ 2904 /* fall through */
2841 case BP_VECTOR: 2905 case BP_VECTOR:
2906 /*
2907 * Update instruction length as we may reinject #BP from
2908 * user space while in guest debugging mode. Reading it for
2909 * #DB as well causes no harm, it is not used in that case.
2910 */
2911 vmx->vcpu.arch.event_exit_inst_len =
2912 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2842 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2913 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2843 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 2914 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
2844 kvm_run->debug.arch.exception = ex_no; 2915 kvm_run->debug.arch.exception = ex_no;
@@ -2940,11 +3011,10 @@ static int handle_cr(struct kvm_vcpu *vcpu)
2940 }; 3011 };
2941 break; 3012 break;
2942 case 2: /* clts */ 3013 case 2: /* clts */
2943 vmx_fpu_deactivate(vcpu); 3014 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
2944 vcpu->arch.cr0 &= ~X86_CR0_TS; 3015 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
2945 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2946 vmx_fpu_activate(vcpu);
2947 skip_emulated_instruction(vcpu); 3016 skip_emulated_instruction(vcpu);
3017 vmx_fpu_activate(vcpu);
2948 return 1; 3018 return 1;
2949 case 1: /*mov from cr*/ 3019 case 1: /*mov from cr*/
2950 switch (cr) { 3020 switch (cr) {
@@ -2962,7 +3032,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
2962 } 3032 }
2963 break; 3033 break;
2964 case 3: /* lmsw */ 3034 case 3: /* lmsw */
2965 kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); 3035 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
3036 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
3037 kvm_lmsw(vcpu, val);
2966 3038
2967 skip_emulated_instruction(vcpu); 3039 skip_emulated_instruction(vcpu);
2968 return 1; 3040 return 1;
@@ -2975,12 +3047,22 @@ static int handle_cr(struct kvm_vcpu *vcpu)
2975 return 0; 3047 return 0;
2976} 3048}
2977 3049
3050static int check_dr_alias(struct kvm_vcpu *vcpu)
3051{
3052 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
3053 kvm_queue_exception(vcpu, UD_VECTOR);
3054 return -1;
3055 }
3056 return 0;
3057}
3058
2978static int handle_dr(struct kvm_vcpu *vcpu) 3059static int handle_dr(struct kvm_vcpu *vcpu)
2979{ 3060{
2980 unsigned long exit_qualification; 3061 unsigned long exit_qualification;
2981 unsigned long val; 3062 unsigned long val;
2982 int dr, reg; 3063 int dr, reg;
2983 3064
 3065 /* Do not handle if CPL > 0; the injected #GP will trigger on re-entry */
2984 if (!kvm_require_cpl(vcpu, 0)) 3066 if (!kvm_require_cpl(vcpu, 0))
2985 return 1; 3067 return 1;
2986 dr = vmcs_readl(GUEST_DR7); 3068 dr = vmcs_readl(GUEST_DR7);
@@ -3016,14 +3098,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
3016 case 0 ... 3: 3098 case 0 ... 3:
3017 val = vcpu->arch.db[dr]; 3099 val = vcpu->arch.db[dr];
3018 break; 3100 break;
3101 case 4:
3102 if (check_dr_alias(vcpu) < 0)
3103 return 1;
3104 /* fall through */
3019 case 6: 3105 case 6:
3020 val = vcpu->arch.dr6; 3106 val = vcpu->arch.dr6;
3021 break; 3107 break;
3022 case 7: 3108 case 5:
3109 if (check_dr_alias(vcpu) < 0)
3110 return 1;
3111 /* fall through */
3112 default: /* 7 */
3023 val = vcpu->arch.dr7; 3113 val = vcpu->arch.dr7;
3024 break; 3114 break;
3025 default:
3026 val = 0;
3027 } 3115 }
3028 kvm_register_write(vcpu, reg, val); 3116 kvm_register_write(vcpu, reg, val);
3029 } else { 3117 } else {
@@ -3034,21 +3122,25 @@ static int handle_dr(struct kvm_vcpu *vcpu)
3034 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 3122 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
3035 vcpu->arch.eff_db[dr] = val; 3123 vcpu->arch.eff_db[dr] = val;
3036 break; 3124 break;
3037 case 4 ... 5: 3125 case 4:
3038 if (vcpu->arch.cr4 & X86_CR4_DE) 3126 if (check_dr_alias(vcpu) < 0)
3039 kvm_queue_exception(vcpu, UD_VECTOR); 3127 return 1;
3040 break; 3128 /* fall through */
3041 case 6: 3129 case 6:
3042 if (val & 0xffffffff00000000ULL) { 3130 if (val & 0xffffffff00000000ULL) {
3043 kvm_queue_exception(vcpu, GP_VECTOR); 3131 kvm_inject_gp(vcpu, 0);
3044 break; 3132 return 1;
3045 } 3133 }
3046 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 3134 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
3047 break; 3135 break;
3048 case 7: 3136 case 5:
3137 if (check_dr_alias(vcpu) < 0)
3138 return 1;
3139 /* fall through */
3140 default: /* 7 */
3049 if (val & 0xffffffff00000000ULL) { 3141 if (val & 0xffffffff00000000ULL) {
3050 kvm_queue_exception(vcpu, GP_VECTOR); 3142 kvm_inject_gp(vcpu, 0);
3051 break; 3143 return 1;
3052 } 3144 }
3053 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 3145 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
3054 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 3146 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
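check_dr_alias() in the hunks above implements the DR4/DR5 aliasing rule used by both the read and the write paths: with CR4.DE clear the two registers alias DR6/DR7, with CR4.DE set any access raises #UD. A userspace sketch of the same decision, with the exception plumbing replaced by return codes (the names here are illustrative, not kernel APIs):

#include <stdio.h>

enum dr_access { DR_OK, DR_RAISE_UD };

/* Map a debug-register number to the register actually accessed,
 * following the DR4/DR5 aliasing rule gated by CR4.DE. */
static enum dr_access resolve_dr(int dr, int cr4_de, int *effective)
{
	if (dr == 4 || dr == 5) {
		if (cr4_de)
			return DR_RAISE_UD;  /* CR4.DE=1: DR4/DR5 are reserved */
		dr += 2;                     /* CR4.DE=0: alias to DR6/DR7 */
	}
	*effective = dr;
	return DR_OK;
}

int main(void)
{
	int eff;

	if (resolve_dr(4, 0, &eff) == DR_OK)
		printf("DR4 with CR4.DE=0 -> DR%d\n", eff);   /* prints DR6 */
	if (resolve_dr(5, 1, &eff) == DR_RAISE_UD)
		printf("DR5 with CR4.DE=1 -> #UD\n");
	return 0;
}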
@@ -3075,6 +3167,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
3075 u64 data; 3167 u64 data;
3076 3168
3077 if (vmx_get_msr(vcpu, ecx, &data)) { 3169 if (vmx_get_msr(vcpu, ecx, &data)) {
3170 trace_kvm_msr_read_ex(ecx);
3078 kvm_inject_gp(vcpu, 0); 3171 kvm_inject_gp(vcpu, 0);
3079 return 1; 3172 return 1;
3080 } 3173 }
@@ -3094,13 +3187,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
3094 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3187 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
3095 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3188 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3096 3189
3097 trace_kvm_msr_write(ecx, data);
3098
3099 if (vmx_set_msr(vcpu, ecx, data) != 0) { 3190 if (vmx_set_msr(vcpu, ecx, data) != 0) {
3191 trace_kvm_msr_write_ex(ecx, data);
3100 kvm_inject_gp(vcpu, 0); 3192 kvm_inject_gp(vcpu, 0);
3101 return 1; 3193 return 1;
3102 } 3194 }
3103 3195
3196 trace_kvm_msr_write(ecx, data);
3104 skip_emulated_instruction(vcpu); 3197 skip_emulated_instruction(vcpu);
3105 return 1; 3198 return 1;
3106} 3199}
@@ -3385,7 +3478,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3385 } 3478 }
3386 3479
3387 if (err != EMULATE_DONE) { 3480 if (err != EMULATE_DONE) {
3388 kvm_report_emulation_failure(vcpu, "emulation failure");
3389 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3481 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3390 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 3482 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3391 vcpu->run->internal.ndata = 0; 3483 vcpu->run->internal.ndata = 0;
@@ -3416,6 +3508,12 @@ static int handle_pause(struct kvm_vcpu *vcpu)
3416 return 1; 3508 return 1;
3417} 3509}
3418 3510
3511static int handle_invalid_op(struct kvm_vcpu *vcpu)
3512{
3513 kvm_queue_exception(vcpu, UD_VECTOR);
3514 return 1;
3515}
3516
3419/* 3517/*
3420 * The exit handlers return 1 if the exit was handled fully and guest execution 3518 * The exit handlers return 1 if the exit was handled fully and guest execution
3421 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 3519 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -3453,6 +3551,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3453 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3551 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3454 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 3552 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3455 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 3553 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
3554 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
3555 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
3456}; 3556};
3457 3557
3458static const int kvm_vmx_max_exit_handlers = 3558static const int kvm_vmx_max_exit_handlers =
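The table above maps VM-exit reason numbers to handler functions; adding the MWAIT/MONITOR entries simply points two more slots at handle_invalid_op so the guest receives #UD. A reduced sketch of the same dispatch pattern, with made-up exit-reason numbers and handler bodies:

#include <stdio.h>

#define EXIT_HLT    1
#define EXIT_MWAIT  2
#define EXIT_MAX    3

static int handle_hlt(void)        { puts("hlt: yield the vcpu"); return 1; }
static int handle_invalid_op(void) { puts("queue #UD into the guest"); return 1; }

/* Sparse table indexed by exit reason; unset slots stay NULL. */
static int (*exit_handlers[EXIT_MAX])(void) = {
	[EXIT_HLT]   = handle_hlt,
	[EXIT_MWAIT] = handle_invalid_op,
};

static int dispatch(unsigned reason)
{
	if (reason < EXIT_MAX && exit_handlers[reason])
		return exit_handlers[reason]();
	printf("unhandled exit reason %u\n", reason);
	return 0;
}

int main(void)
{
	dispatch(EXIT_MWAIT);
	dispatch(42);
	return 0;
}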
@@ -3686,9 +3786,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3686 */ 3786 */
3687 vmcs_writel(HOST_CR0, read_cr0()); 3787 vmcs_writel(HOST_CR0, read_cr0());
3688 3788
3689 if (vcpu->arch.switch_db_regs)
3690 set_debugreg(vcpu->arch.dr6, 6);
3691
3692 asm( 3789 asm(
3693 /* Store host registers */ 3790 /* Store host registers */
3694 "push %%"R"dx; push %%"R"bp;" 3791 "push %%"R"dx; push %%"R"bp;"
@@ -3789,9 +3886,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3789 | (1 << VCPU_EXREG_PDPTR)); 3886 | (1 << VCPU_EXREG_PDPTR));
3790 vcpu->arch.regs_dirty = 0; 3887 vcpu->arch.regs_dirty = 0;
3791 3888
3792 if (vcpu->arch.switch_db_regs)
3793 get_debugreg(vcpu->arch.dr6, 6);
3794
3795 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3889 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3796 if (vmx->rmode.irq.pending) 3890 if (vmx->rmode.irq.pending)
3797 fixup_rmode_irq(vmx); 3891 fixup_rmode_irq(vmx);
@@ -3920,7 +4014,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3920 * b. VT-d with snooping control feature: snooping control feature of 4014 * b. VT-d with snooping control feature: snooping control feature of
3921 * VT-d engine can guarantee the cache correctness. Just set it 4015 * VT-d engine can guarantee the cache correctness. Just set it
3922 * to WB to keep consistent with host. So the same as item 3. 4016 * to WB to keep consistent with host. So the same as item 3.
3923 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep 4017 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
3924 * consistent with host MTRR 4018 * consistent with host MTRR
3925 */ 4019 */
3926 if (is_mmio) 4020 if (is_mmio)
@@ -3931,37 +4025,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3931 VMX_EPT_MT_EPTE_SHIFT; 4025 VMX_EPT_MT_EPTE_SHIFT;
3932 else 4026 else
3933 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) 4027 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
3934 | VMX_EPT_IGMT_BIT; 4028 | VMX_EPT_IPAT_BIT;
3935 4029
3936 return ret; 4030 return ret;
3937} 4031}
3938 4032
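The comment block and function above encode the memory type into the EPT entry: UC for MMIO, the guest's MTRR type when VT-d snooping cannot be relied on, or WB with the "ignore PAT" bit set otherwise (the patch also renames IGMT to the SDM's IPAT). A sketch of the bit packing, where the shift and bit position are stated as assumptions (EPT PTE bits 5:3 hold the type and bit 6 is IPAT in the SDM layout):

#include <stdio.h>
#include <stdint.h>

#define EPT_MT_SHIFT         3           /* assumed: EPT PTE bits 5:3 = memory type */
#define EPT_IPAT_BIT         (1ULL << 6) /* assumed: bit 6 = ignore guest PAT */
#define MTRR_TYPE_UNCACHABLE 0
#define MTRR_TYPE_WRBACK     6

static uint64_t ept_mt_mask(int is_mmio, int need_guest_mtrr, int guest_mtrr_type)
{
	if (is_mmio)
		return (uint64_t)MTRR_TYPE_UNCACHABLE << EPT_MT_SHIFT;
	if (need_guest_mtrr)
		return (uint64_t)guest_mtrr_type << EPT_MT_SHIFT;
	/* no non-coherent DMA: force WB and ignore the guest PAT */
	return ((uint64_t)MTRR_TYPE_WRBACK << EPT_MT_SHIFT) | EPT_IPAT_BIT;
}

int main(void)
{
	printf("mmio:   %#llx\n", (unsigned long long)ept_mt_mask(1, 0, 0));
	printf("normal: %#llx\n", (unsigned long long)ept_mt_mask(0, 0, 0));
	return 0;
}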
4033#define _ER(x) { EXIT_REASON_##x, #x }
4034
3939static const struct trace_print_flags vmx_exit_reasons_str[] = { 4035static const struct trace_print_flags vmx_exit_reasons_str[] = {
3940 { EXIT_REASON_EXCEPTION_NMI, "exception" }, 4036 _ER(EXCEPTION_NMI),
3941 { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, 4037 _ER(EXTERNAL_INTERRUPT),
3942 { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, 4038 _ER(TRIPLE_FAULT),
3943 { EXIT_REASON_NMI_WINDOW, "nmi_window" }, 4039 _ER(PENDING_INTERRUPT),
3944 { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, 4040 _ER(NMI_WINDOW),
3945 { EXIT_REASON_CR_ACCESS, "cr_access" }, 4041 _ER(TASK_SWITCH),
3946 { EXIT_REASON_DR_ACCESS, "dr_access" }, 4042 _ER(CPUID),
3947 { EXIT_REASON_CPUID, "cpuid" }, 4043 _ER(HLT),
3948 { EXIT_REASON_MSR_READ, "rdmsr" }, 4044 _ER(INVLPG),
3949 { EXIT_REASON_MSR_WRITE, "wrmsr" }, 4045 _ER(RDPMC),
3950 { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, 4046 _ER(RDTSC),
3951 { EXIT_REASON_HLT, "halt" }, 4047 _ER(VMCALL),
3952 { EXIT_REASON_INVLPG, "invlpg" }, 4048 _ER(VMCLEAR),
3953 { EXIT_REASON_VMCALL, "hypercall" }, 4049 _ER(VMLAUNCH),
3954 { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, 4050 _ER(VMPTRLD),
3955 { EXIT_REASON_APIC_ACCESS, "apic_access" }, 4051 _ER(VMPTRST),
3956 { EXIT_REASON_WBINVD, "wbinvd" }, 4052 _ER(VMREAD),
3957 { EXIT_REASON_TASK_SWITCH, "task_switch" }, 4053 _ER(VMRESUME),
3958 { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, 4054 _ER(VMWRITE),
4055 _ER(VMOFF),
4056 _ER(VMON),
4057 _ER(CR_ACCESS),
4058 _ER(DR_ACCESS),
4059 _ER(IO_INSTRUCTION),
4060 _ER(MSR_READ),
4061 _ER(MSR_WRITE),
4062 _ER(MWAIT_INSTRUCTION),
4063 _ER(MONITOR_INSTRUCTION),
4064 _ER(PAUSE_INSTRUCTION),
4065 _ER(MCE_DURING_VMENTRY),
4066 _ER(TPR_BELOW_THRESHOLD),
4067 _ER(APIC_ACCESS),
4068 _ER(EPT_VIOLATION),
4069 _ER(EPT_MISCONFIG),
4070 _ER(WBINVD),
3959 { -1, NULL } 4071 { -1, NULL }
3960}; 4072};
3961 4073
3962static bool vmx_gb_page_enable(void) 4074#undef _ER
4075
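The _ER() macro above builds each table entry from a single token, pasting it onto EXIT_REASON_ for the numeric value and stringizing it for the display name, which keeps the two from drifting apart. The same idiom in isolation (the constants below are placeholders):

#include <stdio.h>

#define EXIT_REASON_CPUID 10   /* placeholder values */
#define EXIT_REASON_HLT   12

struct name_entry { int value; const char *name; };

/* Token-paste for the numeric value, stringize for the display name. */
#define _ER(x) { EXIT_REASON_##x, #x }

static const struct name_entry exit_reasons[] = {
	_ER(CPUID),
	_ER(HLT),
	{ -1, NULL }
};
#undef _ER

int main(void)
{
	for (int i = 0; exit_reasons[i].name; i++)
		printf("%d -> %s\n", exit_reasons[i].value, exit_reasons[i].name);
	return 0;
}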
4076static int vmx_get_lpage_level(void)
4077{
4078 if (enable_ept && !cpu_has_vmx_ept_1g_page())
4079 return PT_DIRECTORY_LEVEL;
4080 else
4081 /* For shadow and EPT supported 1GB page */
4082 return PT_PDPE_LEVEL;
4083}
4084
4085static inline u32 bit(int bitno)
4086{
4087 return 1 << (bitno & 31);
4088}
4089
4090static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
3963{ 4091{
3964 return false; 4092 struct kvm_cpuid_entry2 *best;
4093 struct vcpu_vmx *vmx = to_vmx(vcpu);
4094 u32 exec_control;
4095
4096 vmx->rdtscp_enabled = false;
4097 if (vmx_rdtscp_supported()) {
4098 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
4099 if (exec_control & SECONDARY_EXEC_RDTSCP) {
4100 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
4101 if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
4102 vmx->rdtscp_enabled = true;
4103 else {
4104 exec_control &= ~SECONDARY_EXEC_RDTSCP;
4105 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4106 exec_control);
4107 }
4108 }
4109 }
3965} 4110}
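vmx_cpuid_update() above only leaves the RDTSCP execution control enabled when the guest's CPUID actually advertises the feature; otherwise it clears the control so the instruction faults. A host-side sketch of the same feature test using the compiler's cpuid helper; the leaf and the bit position (CPUID.80000001H:EDX bit 27) are taken from the SDM and should be treated as an assumption here:

#include <stdio.h>
#include <cpuid.h>   /* GCC/Clang __get_cpuid() */

static inline unsigned bit(int bitno)
{
	return 1u << (bitno & 31);
}

int main(void)
{
	unsigned eax, ebx, ecx, edx;

	if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
		puts("extended leaf not available");
		return 1;
	}
	/* RDTSCP is reported in CPUID.80000001H:EDX bit 27 */
	printf("rdtscp %ssupported\n", (edx & bit(27)) ? "" : "not ");
	return 0;
}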
3966 4111
3967static struct kvm_x86_ops vmx_x86_ops = { 4112static struct kvm_x86_ops vmx_x86_ops = {
@@ -3990,6 +4135,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3990 .set_segment = vmx_set_segment, 4135 .set_segment = vmx_set_segment,
3991 .get_cpl = vmx_get_cpl, 4136 .get_cpl = vmx_get_cpl,
3992 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 4137 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
4138 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
3993 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 4139 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
3994 .set_cr0 = vmx_set_cr0, 4140 .set_cr0 = vmx_set_cr0,
3995 .set_cr3 = vmx_set_cr3, 4141 .set_cr3 = vmx_set_cr3,
@@ -4002,6 +4148,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
4002 .cache_reg = vmx_cache_reg, 4148 .cache_reg = vmx_cache_reg,
4003 .get_rflags = vmx_get_rflags, 4149 .get_rflags = vmx_get_rflags,
4004 .set_rflags = vmx_set_rflags, 4150 .set_rflags = vmx_set_rflags,
4151 .fpu_activate = vmx_fpu_activate,
4152 .fpu_deactivate = vmx_fpu_deactivate,
4005 4153
4006 .tlb_flush = vmx_flush_tlb, 4154 .tlb_flush = vmx_flush_tlb,
4007 4155
@@ -4027,7 +4175,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
4027 .get_mt_mask = vmx_get_mt_mask, 4175 .get_mt_mask = vmx_get_mt_mask,
4028 4176
4029 .exit_reasons_str = vmx_exit_reasons_str, 4177 .exit_reasons_str = vmx_exit_reasons_str,
4030 .gb_page_enable = vmx_gb_page_enable, 4178 .get_lpage_level = vmx_get_lpage_level,
4179
4180 .cpuid_update = vmx_cpuid_update,
4181
4182 .rdtscp_supported = vmx_rdtscp_supported,
4031}; 4183};
4032 4184
4033static int __init vmx_init(void) 4185static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1e1bc9d412d..e46282a56565 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -38,6 +38,7 @@
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <linux/user-return-notifier.h> 40#include <linux/user-return-notifier.h>
41#include <linux/srcu.h>
41#include <trace/events/kvm.h> 42#include <trace/events/kvm.h>
42#undef TRACE_INCLUDE_FILE 43#undef TRACE_INCLUDE_FILE
43#define CREATE_TRACE_POINTS 44#define CREATE_TRACE_POINTS
@@ -93,16 +94,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
93 94
94struct kvm_shared_msrs_global { 95struct kvm_shared_msrs_global {
95 int nr; 96 int nr;
96 struct kvm_shared_msr { 97 u32 msrs[KVM_NR_SHARED_MSRS];
97 u32 msr;
98 u64 value;
99 } msrs[KVM_NR_SHARED_MSRS];
100}; 98};
101 99
102struct kvm_shared_msrs { 100struct kvm_shared_msrs {
103 struct user_return_notifier urn; 101 struct user_return_notifier urn;
104 bool registered; 102 bool registered;
105 u64 current_value[KVM_NR_SHARED_MSRS]; 103 struct kvm_shared_msr_values {
104 u64 host;
105 u64 curr;
106 } values[KVM_NR_SHARED_MSRS];
106}; 107};
107 108
108static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; 109static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
@@ -147,53 +148,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
147static void kvm_on_user_return(struct user_return_notifier *urn) 148static void kvm_on_user_return(struct user_return_notifier *urn)
148{ 149{
149 unsigned slot; 150 unsigned slot;
150 struct kvm_shared_msr *global;
151 struct kvm_shared_msrs *locals 151 struct kvm_shared_msrs *locals
152 = container_of(urn, struct kvm_shared_msrs, urn); 152 = container_of(urn, struct kvm_shared_msrs, urn);
153 struct kvm_shared_msr_values *values;
153 154
154 for (slot = 0; slot < shared_msrs_global.nr; ++slot) { 155 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
155 global = &shared_msrs_global.msrs[slot]; 156 values = &locals->values[slot];
156 if (global->value != locals->current_value[slot]) { 157 if (values->host != values->curr) {
157 wrmsrl(global->msr, global->value); 158 wrmsrl(shared_msrs_global.msrs[slot], values->host);
158 locals->current_value[slot] = global->value; 159 values->curr = values->host;
159 } 160 }
160 } 161 }
161 locals->registered = false; 162 locals->registered = false;
162 user_return_notifier_unregister(urn); 163 user_return_notifier_unregister(urn);
163} 164}
164 165
165void kvm_define_shared_msr(unsigned slot, u32 msr) 166static void shared_msr_update(unsigned slot, u32 msr)
166{ 167{
167 int cpu; 168 struct kvm_shared_msrs *smsr;
168 u64 value; 169 u64 value;
169 170
171 smsr = &__get_cpu_var(shared_msrs);
 172 /* read-only at this point; nobody should be modifying it,
 173 * so no lock is needed */
174 if (slot >= shared_msrs_global.nr) {
 175 printk(KERN_ERR "kvm: invalid MSR slot!\n");
176 return;
177 }
178 rdmsrl_safe(msr, &value);
179 smsr->values[slot].host = value;
180 smsr->values[slot].curr = value;
181}
182
183void kvm_define_shared_msr(unsigned slot, u32 msr)
184{
170 if (slot >= shared_msrs_global.nr) 185 if (slot >= shared_msrs_global.nr)
171 shared_msrs_global.nr = slot + 1; 186 shared_msrs_global.nr = slot + 1;
172 shared_msrs_global.msrs[slot].msr = msr; 187 shared_msrs_global.msrs[slot] = msr;
 173 rdmsrl_safe(msr, &value); 188 /* make sure shared_msrs_global has been updated before it is read */
174 shared_msrs_global.msrs[slot].value = value; 189 smp_wmb();
175 for_each_online_cpu(cpu)
176 per_cpu(shared_msrs, cpu).current_value[slot] = value;
177} 190}
178EXPORT_SYMBOL_GPL(kvm_define_shared_msr); 191EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
179 192
180static void kvm_shared_msr_cpu_online(void) 193static void kvm_shared_msr_cpu_online(void)
181{ 194{
182 unsigned i; 195 unsigned i;
183 struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
184 196
185 for (i = 0; i < shared_msrs_global.nr; ++i) 197 for (i = 0; i < shared_msrs_global.nr; ++i)
186 locals->current_value[i] = shared_msrs_global.msrs[i].value; 198 shared_msr_update(i, shared_msrs_global.msrs[i]);
187} 199}
188 200
189void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) 201void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
190{ 202{
191 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); 203 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
192 204
193 if (((value ^ smsr->current_value[slot]) & mask) == 0) 205 if (((value ^ smsr->values[slot].curr) & mask) == 0)
194 return; 206 return;
195 smsr->current_value[slot] = value; 207 smsr->values[slot].curr = value;
196 wrmsrl(shared_msrs_global.msrs[slot].msr, value); 208 wrmsrl(shared_msrs_global.msrs[slot], value);
197 if (!smsr->registered) { 209 if (!smsr->registered) {
198 smsr->urn.on_user_return = kvm_on_user_return; 210 smsr->urn.on_user_return = kvm_on_user_return;
199 user_return_notifier_register(&smsr->urn); 211 user_return_notifier_register(&smsr->urn);
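The reworked bookkeeping above keeps, per MSR slot, the host value sampled at setup time and the value currently loaded; kvm_set_shared_msr() only issues a wrmsr when the masked bits actually change, and the user-return notifier lazily writes the host value back. A stripped-down sketch of that state machine with the wrmsr replaced by a print (the struct and function names are illustrative):

#include <stdio.h>
#include <stdint.h>

struct msr_values { uint64_t host; uint64_t curr; };

static void fake_wrmsr(unsigned slot, uint64_t v)
{
	printf("wrmsr slot %u <- %#llx\n", slot, (unsigned long long)v);
}

static void set_shared(struct msr_values *m, unsigned slot, uint64_t value, uint64_t mask)
{
	if (((value ^ m->curr) & mask) == 0)
		return;                 /* nothing relevant changed: skip the write */
	m->curr = value;
	fake_wrmsr(slot, value);
}

static void on_user_return(struct msr_values *m, unsigned slot)
{
	if (m->curr != m->host) {       /* restore the host value lazily */
		fake_wrmsr(slot, m->host);
		m->curr = m->host;
	}
}

int main(void)
{
	struct msr_values m = { .host = 0x10, .curr = 0x10 };

	set_shared(&m, 0, 0x20, ~0ULL);  /* guest value loaded */
	set_shared(&m, 0, 0x20, ~0ULL);  /* no-op: unchanged */
	on_user_return(&m, 0);           /* host value restored */
	return 0;
}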
@@ -257,12 +269,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
257} 269}
258EXPORT_SYMBOL_GPL(kvm_set_apic_base); 270EXPORT_SYMBOL_GPL(kvm_set_apic_base);
259 271
272#define EXCPT_BENIGN 0
273#define EXCPT_CONTRIBUTORY 1
274#define EXCPT_PF 2
275
276static int exception_class(int vector)
277{
278 switch (vector) {
279 case PF_VECTOR:
280 return EXCPT_PF;
281 case DE_VECTOR:
282 case TS_VECTOR:
283 case NP_VECTOR:
284 case SS_VECTOR:
285 case GP_VECTOR:
286 return EXCPT_CONTRIBUTORY;
287 default:
288 break;
289 }
290 return EXCPT_BENIGN;
291}
292
293static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
294 unsigned nr, bool has_error, u32 error_code)
295{
296 u32 prev_nr;
297 int class1, class2;
298
299 if (!vcpu->arch.exception.pending) {
300 queue:
301 vcpu->arch.exception.pending = true;
302 vcpu->arch.exception.has_error_code = has_error;
303 vcpu->arch.exception.nr = nr;
304 vcpu->arch.exception.error_code = error_code;
305 return;
306 }
307
 308 /* an exception is already pending; work out how the two combine */
309 prev_nr = vcpu->arch.exception.nr;
310 if (prev_nr == DF_VECTOR) {
311 /* triple fault -> shutdown */
312 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
313 return;
314 }
315 class1 = exception_class(prev_nr);
316 class2 = exception_class(nr);
317 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
318 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
319 /* generate double fault per SDM Table 5-5 */
320 vcpu->arch.exception.pending = true;
321 vcpu->arch.exception.has_error_code = true;
322 vcpu->arch.exception.nr = DF_VECTOR;
323 vcpu->arch.exception.error_code = 0;
324 } else
 325 /* replace the previous exception with the new one, in the hope
 326 that instruction re-execution will regenerate the lost
 327 exception */
328 goto queue;
329}
330
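kvm_multiple_exception() above classifies the pending and the new exception and merges them per SDM Table 5-5: two contributory exceptions, or a page fault followed by anything non-benign, become #DF, while a fault raised on top of a pending #DF is a triple fault. A userspace sketch of that decision; the vector numbers are assumptions taken from the SDM, and the list of contributory vectors is abbreviated:

#include <stdio.h>

enum { EXCPT_BENIGN, EXCPT_CONTRIBUTORY, EXCPT_PF };
enum action { QUEUE_NEW, QUEUE_DF, TRIPLE_FAULT };

#define DE_VECTOR 0
#define DF_VECTOR 8
#define GP_VECTOR 13
#define PF_VECTOR 14

static int exception_class(int vector)
{
	switch (vector) {
	case PF_VECTOR:
		return EXCPT_PF;
	case DE_VECTOR: case GP_VECTOR: /* TS/NP/SS are contributory as well */
		return EXCPT_CONTRIBUTORY;
	default:
		return EXCPT_BENIGN;
	}
}

static enum action merge(int pending, int new_vec)
{
	int c1, c2;

	if (pending == DF_VECTOR)
		return TRIPLE_FAULT;            /* fault while delivering #DF */
	c1 = exception_class(pending);
	c2 = exception_class(new_vec);
	if ((c1 == EXCPT_CONTRIBUTORY && c2 == EXCPT_CONTRIBUTORY) ||
	    (c1 == EXCPT_PF && c2 != EXCPT_BENIGN))
		return QUEUE_DF;                /* promote to double fault */
	return QUEUE_NEW;                       /* replace, hoping re-execution regenerates it */
}

int main(void)
{
	printf("#PF then #GP -> %s\n", merge(PF_VECTOR, GP_VECTOR) == QUEUE_DF ? "#DF" : "other");
	printf("#DF then #GP -> %s\n", merge(DF_VECTOR, GP_VECTOR) == TRIPLE_FAULT ? "triple fault" : "other");
	return 0;
}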
260void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 331void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
261{ 332{
262 WARN_ON(vcpu->arch.exception.pending); 333 kvm_multiple_exception(vcpu, nr, false, 0);
263 vcpu->arch.exception.pending = true;
264 vcpu->arch.exception.has_error_code = false;
265 vcpu->arch.exception.nr = nr;
266} 334}
267EXPORT_SYMBOL_GPL(kvm_queue_exception); 335EXPORT_SYMBOL_GPL(kvm_queue_exception);
268 336
@@ -270,25 +338,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
270 u32 error_code) 338 u32 error_code)
271{ 339{
272 ++vcpu->stat.pf_guest; 340 ++vcpu->stat.pf_guest;
273
274 if (vcpu->arch.exception.pending) {
275 switch(vcpu->arch.exception.nr) {
276 case DF_VECTOR:
277 /* triple fault -> shutdown */
278 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
279 return;
280 case PF_VECTOR:
281 vcpu->arch.exception.nr = DF_VECTOR;
282 vcpu->arch.exception.error_code = 0;
283 return;
284 default:
285 /* replace previous exception with a new one in a hope
286 that instruction re-execution will regenerate lost
287 exception */
288 vcpu->arch.exception.pending = false;
289 break;
290 }
291 }
292 vcpu->arch.cr2 = addr; 341 vcpu->arch.cr2 = addr;
293 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 342 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
294} 343}
@@ -301,11 +350,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
301 350
302void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 351void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
303{ 352{
304 WARN_ON(vcpu->arch.exception.pending); 353 kvm_multiple_exception(vcpu, nr, true, error_code);
305 vcpu->arch.exception.pending = true;
306 vcpu->arch.exception.has_error_code = true;
307 vcpu->arch.exception.nr = nr;
308 vcpu->arch.exception.error_code = error_code;
309} 354}
310EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 355EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
311 356
@@ -383,12 +428,18 @@ out:
383 428
384void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 429void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
385{ 430{
386 if (cr0 & CR0_RESERVED_BITS) { 431 cr0 |= X86_CR0_ET;
432
433#ifdef CONFIG_X86_64
434 if (cr0 & 0xffffffff00000000UL) {
387 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 435 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
388 cr0, vcpu->arch.cr0); 436 cr0, kvm_read_cr0(vcpu));
389 kvm_inject_gp(vcpu, 0); 437 kvm_inject_gp(vcpu, 0);
390 return; 438 return;
391 } 439 }
440#endif
441
442 cr0 &= ~CR0_RESERVED_BITS;
392 443
393 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 444 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
394 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 445 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
@@ -405,7 +456,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
405 456
406 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 457 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
407#ifdef CONFIG_X86_64 458#ifdef CONFIG_X86_64
408 if ((vcpu->arch.shadow_efer & EFER_LME)) { 459 if ((vcpu->arch.efer & EFER_LME)) {
409 int cs_db, cs_l; 460 int cs_db, cs_l;
410 461
411 if (!is_pae(vcpu)) { 462 if (!is_pae(vcpu)) {
@@ -443,13 +494,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
443 494
444void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 495void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
445{ 496{
446 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 497 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
447} 498}
448EXPORT_SYMBOL_GPL(kvm_lmsw); 499EXPORT_SYMBOL_GPL(kvm_lmsw);
449 500
450void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 501void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
451{ 502{
452 unsigned long old_cr4 = vcpu->arch.cr4; 503 unsigned long old_cr4 = kvm_read_cr4(vcpu);
453 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 504 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
454 505
455 if (cr4 & CR4_RESERVED_BITS) { 506 if (cr4 & CR4_RESERVED_BITS) {
@@ -575,9 +626,11 @@ static inline u32 bit(int bitno)
575 * kvm-specific. Those are put in the beginning of the list. 626 * kvm-specific. Those are put in the beginning of the list.
576 */ 627 */
577 628
578#define KVM_SAVE_MSRS_BEGIN 2 629#define KVM_SAVE_MSRS_BEGIN 5
579static u32 msrs_to_save[] = { 630static u32 msrs_to_save[] = {
580 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 631 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
632 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
633 HV_X64_MSR_APIC_ASSIST_PAGE,
581 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 634 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
582 MSR_K6_STAR, 635 MSR_K6_STAR,
583#ifdef CONFIG_X86_64 636#ifdef CONFIG_X86_64
@@ -602,7 +655,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
602 } 655 }
603 656
604 if (is_paging(vcpu) 657 if (is_paging(vcpu)
605 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { 658 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
606 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); 659 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
607 kvm_inject_gp(vcpu, 0); 660 kvm_inject_gp(vcpu, 0);
608 return; 661 return;
@@ -633,9 +686,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
633 kvm_x86_ops->set_efer(vcpu, efer); 686 kvm_x86_ops->set_efer(vcpu, efer);
634 687
635 efer &= ~EFER_LMA; 688 efer &= ~EFER_LMA;
636 efer |= vcpu->arch.shadow_efer & EFER_LMA; 689 efer |= vcpu->arch.efer & EFER_LMA;
637 690
638 vcpu->arch.shadow_efer = efer; 691 vcpu->arch.efer = efer;
639 692
640 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 693 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
641 kvm_mmu_reset_context(vcpu); 694 kvm_mmu_reset_context(vcpu);
@@ -957,6 +1010,100 @@ out:
957 return r; 1010 return r;
958} 1011}
959 1012
1013static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
1014{
1015 return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
1016}
1017
1018static bool kvm_hv_msr_partition_wide(u32 msr)
1019{
1020 bool r = false;
1021 switch (msr) {
1022 case HV_X64_MSR_GUEST_OS_ID:
1023 case HV_X64_MSR_HYPERCALL:
1024 r = true;
1025 break;
1026 }
1027
1028 return r;
1029}
1030
1031static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1032{
1033 struct kvm *kvm = vcpu->kvm;
1034
1035 switch (msr) {
1036 case HV_X64_MSR_GUEST_OS_ID:
1037 kvm->arch.hv_guest_os_id = data;
1038 /* setting guest os id to zero disables hypercall page */
1039 if (!kvm->arch.hv_guest_os_id)
1040 kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1041 break;
1042 case HV_X64_MSR_HYPERCALL: {
1043 u64 gfn;
1044 unsigned long addr;
1045 u8 instructions[4];
1046
 1047 /* if the guest OS id is not set, the hypercall page stays disabled */
1048 if (!kvm->arch.hv_guest_os_id)
1049 break;
1050 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1051 kvm->arch.hv_hypercall = data;
1052 break;
1053 }
1054 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1055 addr = gfn_to_hva(kvm, gfn);
1056 if (kvm_is_error_hva(addr))
1057 return 1;
1058 kvm_x86_ops->patch_hypercall(vcpu, instructions);
1059 ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1060 if (copy_to_user((void __user *)addr, instructions, 4))
1061 return 1;
1062 kvm->arch.hv_hypercall = data;
1063 break;
1064 }
1065 default:
1066 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1067 "data 0x%llx\n", msr, data);
1068 return 1;
1069 }
1070 return 0;
1071}
1072
1073static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1074{
1075 switch (msr) {
1076 case HV_X64_MSR_APIC_ASSIST_PAGE: {
1077 unsigned long addr;
1078
1079 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1080 vcpu->arch.hv_vapic = data;
1081 break;
1082 }
1083 addr = gfn_to_hva(vcpu->kvm, data >>
1084 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1085 if (kvm_is_error_hva(addr))
1086 return 1;
1087 if (clear_user((void __user *)addr, PAGE_SIZE))
1088 return 1;
1089 vcpu->arch.hv_vapic = data;
1090 break;
1091 }
1092 case HV_X64_MSR_EOI:
1093 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1094 case HV_X64_MSR_ICR:
1095 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1096 case HV_X64_MSR_TPR:
1097 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1098 default:
1099 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1100 "data 0x%llx\n", msr, data);
1101 return 1;
1102 }
1103
1104 return 0;
1105}
1106
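The HV_X64_MSR_HYPERCALL handler above writes a tiny code stub into the guest page: the hypercall instruction emitted by patch_hypercall(), followed by a ret so the guest can simply call into the page. A sketch of assembling that 4-byte stub in a buffer; the vmcall encoding (0f 01 c1) is an assumption taken from the SDM, and on AMD the stub would use vmmcall instead:

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char page[4096];
	unsigned char instructions[4];

	/* what patch_hypercall() would emit on Intel: vmcall = 0f 01 c1 */
	const unsigned char vmcall[3] = { 0x0f, 0x01, 0xc1 };

	memcpy(instructions, vmcall, 3);
	instructions[3] = 0xc3;            /* ret, so the page is directly callable */

	memset(page, 0, sizeof(page));
	memcpy(page, instructions, sizeof(instructions));

	for (int i = 0; i < 4; i++)
		printf("%02x ", page[i]);
	printf("\n");
	return 0;
}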
960int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1107int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
961{ 1108{
962 switch (msr) { 1109 switch (msr) {
@@ -1071,6 +1218,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1071 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1218 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1072 "0x%x data 0x%llx\n", msr, data); 1219 "0x%x data 0x%llx\n", msr, data);
1073 break; 1220 break;
1221 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1222 if (kvm_hv_msr_partition_wide(msr)) {
1223 int r;
1224 mutex_lock(&vcpu->kvm->lock);
1225 r = set_msr_hyperv_pw(vcpu, msr, data);
1226 mutex_unlock(&vcpu->kvm->lock);
1227 return r;
1228 } else
1229 return set_msr_hyperv(vcpu, msr, data);
1230 break;
1074 default: 1231 default:
1075 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1232 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1076 return xen_hvm_config(vcpu, data); 1233 return xen_hvm_config(vcpu, data);
@@ -1170,6 +1327,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1170 return 0; 1327 return 0;
1171} 1328}
1172 1329
1330static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1331{
1332 u64 data = 0;
1333 struct kvm *kvm = vcpu->kvm;
1334
1335 switch (msr) {
1336 case HV_X64_MSR_GUEST_OS_ID:
1337 data = kvm->arch.hv_guest_os_id;
1338 break;
1339 case HV_X64_MSR_HYPERCALL:
1340 data = kvm->arch.hv_hypercall;
1341 break;
1342 default:
1343 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1344 return 1;
1345 }
1346
1347 *pdata = data;
1348 return 0;
1349}
1350
1351static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1352{
1353 u64 data = 0;
1354
1355 switch (msr) {
1356 case HV_X64_MSR_VP_INDEX: {
1357 int r;
1358 struct kvm_vcpu *v;
1359 kvm_for_each_vcpu(r, v, vcpu->kvm)
1360 if (v == vcpu)
1361 data = r;
1362 break;
1363 }
1364 case HV_X64_MSR_EOI:
1365 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
1366 case HV_X64_MSR_ICR:
1367 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
1368 case HV_X64_MSR_TPR:
1369 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
1370 default:
1371 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1372 return 1;
1373 }
1374 *pdata = data;
1375 return 0;
1376}
1377
1173int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1378int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1174{ 1379{
1175 u64 data; 1380 u64 data;
@@ -1221,7 +1426,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1221 data |= (((uint64_t)4ULL) << 40); 1426 data |= (((uint64_t)4ULL) << 40);
1222 break; 1427 break;
1223 case MSR_EFER: 1428 case MSR_EFER:
1224 data = vcpu->arch.shadow_efer; 1429 data = vcpu->arch.efer;
1225 break; 1430 break;
1226 case MSR_KVM_WALL_CLOCK: 1431 case MSR_KVM_WALL_CLOCK:
1227 data = vcpu->kvm->arch.wall_clock; 1432 data = vcpu->kvm->arch.wall_clock;
@@ -1236,6 +1441,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1236 case MSR_IA32_MCG_STATUS: 1441 case MSR_IA32_MCG_STATUS:
1237 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1442 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1238 return get_msr_mce(vcpu, msr, pdata); 1443 return get_msr_mce(vcpu, msr, pdata);
1444 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1445 if (kvm_hv_msr_partition_wide(msr)) {
1446 int r;
1447 mutex_lock(&vcpu->kvm->lock);
1448 r = get_msr_hyperv_pw(vcpu, msr, pdata);
1449 mutex_unlock(&vcpu->kvm->lock);
1450 return r;
1451 } else
1452 return get_msr_hyperv(vcpu, msr, pdata);
1453 break;
1239 default: 1454 default:
1240 if (!ignore_msrs) { 1455 if (!ignore_msrs) {
1241 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1456 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -1261,15 +1476,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1261 int (*do_msr)(struct kvm_vcpu *vcpu, 1476 int (*do_msr)(struct kvm_vcpu *vcpu,
1262 unsigned index, u64 *data)) 1477 unsigned index, u64 *data))
1263{ 1478{
1264 int i; 1479 int i, idx;
1265 1480
1266 vcpu_load(vcpu); 1481 vcpu_load(vcpu);
1267 1482
1268 down_read(&vcpu->kvm->slots_lock); 1483 idx = srcu_read_lock(&vcpu->kvm->srcu);
1269 for (i = 0; i < msrs->nmsrs; ++i) 1484 for (i = 0; i < msrs->nmsrs; ++i)
1270 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1485 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1271 break; 1486 break;
1272 up_read(&vcpu->kvm->slots_lock); 1487 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1273 1488
1274 vcpu_put(vcpu); 1489 vcpu_put(vcpu);
1275 1490
@@ -1351,6 +1566,11 @@ int kvm_dev_ioctl_check_extension(long ext)
1351 case KVM_CAP_XEN_HVM: 1566 case KVM_CAP_XEN_HVM:
1352 case KVM_CAP_ADJUST_CLOCK: 1567 case KVM_CAP_ADJUST_CLOCK:
1353 case KVM_CAP_VCPU_EVENTS: 1568 case KVM_CAP_VCPU_EVENTS:
1569 case KVM_CAP_HYPERV:
1570 case KVM_CAP_HYPERV_VAPIC:
1571 case KVM_CAP_HYPERV_SPIN:
1572 case KVM_CAP_PCI_SEGMENT:
1573 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1354 r = 1; 1574 r = 1;
1355 break; 1575 break;
1356 case KVM_CAP_COALESCED_MMIO: 1576 case KVM_CAP_COALESCED_MMIO:
@@ -1464,8 +1684,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1464 1684
1465void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1685void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1466{ 1686{
1467 kvm_x86_ops->vcpu_put(vcpu);
1468 kvm_put_guest_fpu(vcpu); 1687 kvm_put_guest_fpu(vcpu);
1688 kvm_x86_ops->vcpu_put(vcpu);
1469} 1689}
1470 1690
1471static int is_efer_nx(void) 1691static int is_efer_nx(void)
@@ -1530,6 +1750,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1530 cpuid_fix_nx_cap(vcpu); 1750 cpuid_fix_nx_cap(vcpu);
1531 r = 0; 1751 r = 0;
1532 kvm_apic_set_version(vcpu); 1752 kvm_apic_set_version(vcpu);
1753 kvm_x86_ops->cpuid_update(vcpu);
1533 1754
1534out_free: 1755out_free:
1535 vfree(cpuid_entries); 1756 vfree(cpuid_entries);
@@ -1552,6 +1773,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1552 goto out; 1773 goto out;
1553 vcpu->arch.cpuid_nent = cpuid->nent; 1774 vcpu->arch.cpuid_nent = cpuid->nent;
1554 kvm_apic_set_version(vcpu); 1775 kvm_apic_set_version(vcpu);
1776 kvm_x86_ops->cpuid_update(vcpu);
1555 return 0; 1777 return 0;
1556 1778
1557out: 1779out:
@@ -1594,12 +1816,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1594 u32 index, int *nent, int maxnent) 1816 u32 index, int *nent, int maxnent)
1595{ 1817{
1596 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1818 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1597 unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
1598#ifdef CONFIG_X86_64 1819#ifdef CONFIG_X86_64
1820 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
1821 ? F(GBPAGES) : 0;
1599 unsigned f_lm = F(LM); 1822 unsigned f_lm = F(LM);
1600#else 1823#else
1824 unsigned f_gbpages = 0;
1601 unsigned f_lm = 0; 1825 unsigned f_lm = 0;
1602#endif 1826#endif
1827 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
1603 1828
1604 /* cpuid 1.edx */ 1829 /* cpuid 1.edx */
1605 const u32 kvm_supported_word0_x86_features = 1830 const u32 kvm_supported_word0_x86_features =
@@ -1619,7 +1844,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1619 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1844 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1620 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1845 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1621 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1846 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1622 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 1847 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
1623 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1848 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1624 /* cpuid 1.ecx */ 1849 /* cpuid 1.ecx */
1625 const u32 kvm_supported_word4_x86_features = 1850 const u32 kvm_supported_word4_x86_features =
@@ -1866,7 +2091,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1866 return 0; 2091 return 0;
1867 if (mce->status & MCI_STATUS_UC) { 2092 if (mce->status & MCI_STATUS_UC) {
1868 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2093 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1869 !(vcpu->arch.cr4 & X86_CR4_MCE)) { 2094 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
1870 printk(KERN_DEBUG "kvm: set_mce: " 2095 printk(KERN_DEBUG "kvm: set_mce: "
1871 "injects mce exception while " 2096 "injects mce exception while "
1872 "previous one is in progress!\n"); 2097 "previous one is in progress!\n");
@@ -2160,14 +2385,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2160 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 2385 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
2161 return -EINVAL; 2386 return -EINVAL;
2162 2387
2163 down_write(&kvm->slots_lock); 2388 mutex_lock(&kvm->slots_lock);
2164 spin_lock(&kvm->mmu_lock); 2389 spin_lock(&kvm->mmu_lock);
2165 2390
2166 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 2391 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
2167 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 2392 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
2168 2393
2169 spin_unlock(&kvm->mmu_lock); 2394 spin_unlock(&kvm->mmu_lock);
2170 up_write(&kvm->slots_lock); 2395 mutex_unlock(&kvm->slots_lock);
2171 return 0; 2396 return 0;
2172} 2397}
2173 2398
@@ -2176,13 +2401,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2176 return kvm->arch.n_alloc_mmu_pages; 2401 return kvm->arch.n_alloc_mmu_pages;
2177} 2402}
2178 2403
2404gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2405{
2406 int i;
2407 struct kvm_mem_alias *alias;
2408 struct kvm_mem_aliases *aliases;
2409
2410 aliases = rcu_dereference(kvm->arch.aliases);
2411
2412 for (i = 0; i < aliases->naliases; ++i) {
2413 alias = &aliases->aliases[i];
2414 if (alias->flags & KVM_ALIAS_INVALID)
2415 continue;
2416 if (gfn >= alias->base_gfn
2417 && gfn < alias->base_gfn + alias->npages)
2418 return alias->target_gfn + gfn - alias->base_gfn;
2419 }
2420 return gfn;
2421}
2422
2179gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 2423gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2180{ 2424{
2181 int i; 2425 int i;
2182 struct kvm_mem_alias *alias; 2426 struct kvm_mem_alias *alias;
2427 struct kvm_mem_aliases *aliases;
2183 2428
2184 for (i = 0; i < kvm->arch.naliases; ++i) { 2429 aliases = rcu_dereference(kvm->arch.aliases);
2185 alias = &kvm->arch.aliases[i]; 2430
2431 for (i = 0; i < aliases->naliases; ++i) {
2432 alias = &aliases->aliases[i];
2186 if (gfn >= alias->base_gfn 2433 if (gfn >= alias->base_gfn
2187 && gfn < alias->base_gfn + alias->npages) 2434 && gfn < alias->base_gfn + alias->npages)
2188 return alias->target_gfn + gfn - alias->base_gfn; 2435 return alias->target_gfn + gfn - alias->base_gfn;
@@ -2200,6 +2447,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2200{ 2447{
2201 int r, n; 2448 int r, n;
2202 struct kvm_mem_alias *p; 2449 struct kvm_mem_alias *p;
2450 struct kvm_mem_aliases *aliases, *old_aliases;
2203 2451
2204 r = -EINVAL; 2452 r = -EINVAL;
2205 /* General sanity checks */ 2453 /* General sanity checks */
@@ -2216,26 +2464,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2216 < alias->target_phys_addr) 2464 < alias->target_phys_addr)
2217 goto out; 2465 goto out;
2218 2466
2219 down_write(&kvm->slots_lock); 2467 r = -ENOMEM;
2220 spin_lock(&kvm->mmu_lock); 2468 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2469 if (!aliases)
2470 goto out;
2471
2472 mutex_lock(&kvm->slots_lock);
2221 2473
2222 p = &kvm->arch.aliases[alias->slot]; 2474 /* invalidate any gfn reference in case of deletion/shrinking */
2475 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2476 aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
2477 old_aliases = kvm->arch.aliases;
2478 rcu_assign_pointer(kvm->arch.aliases, aliases);
2479 synchronize_srcu_expedited(&kvm->srcu);
2480 kvm_mmu_zap_all(kvm);
2481 kfree(old_aliases);
2482
2483 r = -ENOMEM;
2484 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2485 if (!aliases)
2486 goto out_unlock;
2487
2488 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2489
2490 p = &aliases->aliases[alias->slot];
2223 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 2491 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2224 p->npages = alias->memory_size >> PAGE_SHIFT; 2492 p->npages = alias->memory_size >> PAGE_SHIFT;
2225 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 2493 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2494 p->flags &= ~(KVM_ALIAS_INVALID);
2226 2495
2227 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 2496 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2228 if (kvm->arch.aliases[n - 1].npages) 2497 if (aliases->aliases[n - 1].npages)
2229 break; 2498 break;
2230 kvm->arch.naliases = n; 2499 aliases->naliases = n;
2231 2500
2232 spin_unlock(&kvm->mmu_lock); 2501 old_aliases = kvm->arch.aliases;
2233 kvm_mmu_zap_all(kvm); 2502 rcu_assign_pointer(kvm->arch.aliases, aliases);
2234 2503 synchronize_srcu_expedited(&kvm->srcu);
2235 up_write(&kvm->slots_lock); 2504 kfree(old_aliases);
2236 2505 r = 0;
2237 return 0;
2238 2506
2507out_unlock:
2508 mutex_unlock(&kvm->slots_lock);
2239out: 2509out:
2240 return r; 2510 return r;
2241} 2511}
@@ -2273,18 +2543,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2273 r = 0; 2543 r = 0;
2274 switch (chip->chip_id) { 2544 switch (chip->chip_id) {
2275 case KVM_IRQCHIP_PIC_MASTER: 2545 case KVM_IRQCHIP_PIC_MASTER:
2276 spin_lock(&pic_irqchip(kvm)->lock); 2546 raw_spin_lock(&pic_irqchip(kvm)->lock);
2277 memcpy(&pic_irqchip(kvm)->pics[0], 2547 memcpy(&pic_irqchip(kvm)->pics[0],
2278 &chip->chip.pic, 2548 &chip->chip.pic,
2279 sizeof(struct kvm_pic_state)); 2549 sizeof(struct kvm_pic_state));
2280 spin_unlock(&pic_irqchip(kvm)->lock); 2550 raw_spin_unlock(&pic_irqchip(kvm)->lock);
2281 break; 2551 break;
2282 case KVM_IRQCHIP_PIC_SLAVE: 2552 case KVM_IRQCHIP_PIC_SLAVE:
2283 spin_lock(&pic_irqchip(kvm)->lock); 2553 raw_spin_lock(&pic_irqchip(kvm)->lock);
2284 memcpy(&pic_irqchip(kvm)->pics[1], 2554 memcpy(&pic_irqchip(kvm)->pics[1],
2285 &chip->chip.pic, 2555 &chip->chip.pic,
2286 sizeof(struct kvm_pic_state)); 2556 sizeof(struct kvm_pic_state));
2287 spin_unlock(&pic_irqchip(kvm)->lock); 2557 raw_spin_unlock(&pic_irqchip(kvm)->lock);
2288 break; 2558 break;
2289 case KVM_IRQCHIP_IOAPIC: 2559 case KVM_IRQCHIP_IOAPIC:
2290 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 2560 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -2364,29 +2634,62 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2364int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2634int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2365 struct kvm_dirty_log *log) 2635 struct kvm_dirty_log *log)
2366{ 2636{
2367 int r; 2637 int r, n, i;
2368 int n;
2369 struct kvm_memory_slot *memslot; 2638 struct kvm_memory_slot *memslot;
2370 int is_dirty = 0; 2639 unsigned long is_dirty = 0;
2640 unsigned long *dirty_bitmap = NULL;
2371 2641
2372 down_write(&kvm->slots_lock); 2642 mutex_lock(&kvm->slots_lock);
2373 2643
2374 r = kvm_get_dirty_log(kvm, log, &is_dirty); 2644 r = -EINVAL;
2375 if (r) 2645 if (log->slot >= KVM_MEMORY_SLOTS)
2646 goto out;
2647
2648 memslot = &kvm->memslots->memslots[log->slot];
2649 r = -ENOENT;
2650 if (!memslot->dirty_bitmap)
2651 goto out;
2652
2653 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
2654
2655 r = -ENOMEM;
2656 dirty_bitmap = vmalloc(n);
2657 if (!dirty_bitmap)
2376 goto out; 2658 goto out;
2659 memset(dirty_bitmap, 0, n);
2660
2661 for (i = 0; !is_dirty && i < n/sizeof(long); i++)
2662 is_dirty = memslot->dirty_bitmap[i];
2377 2663
2378 /* If nothing is dirty, don't bother messing with page tables. */ 2664 /* If nothing is dirty, don't bother messing with page tables. */
2379 if (is_dirty) { 2665 if (is_dirty) {
2666 struct kvm_memslots *slots, *old_slots;
2667
2380 spin_lock(&kvm->mmu_lock); 2668 spin_lock(&kvm->mmu_lock);
2381 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2669 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2382 spin_unlock(&kvm->mmu_lock); 2670 spin_unlock(&kvm->mmu_lock);
2383 memslot = &kvm->memslots[log->slot]; 2671
2384 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2672 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2385 memset(memslot->dirty_bitmap, 0, n); 2673 if (!slots)
2674 goto out_free;
2675
2676 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2677 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
2678
2679 old_slots = kvm->memslots;
2680 rcu_assign_pointer(kvm->memslots, slots);
2681 synchronize_srcu_expedited(&kvm->srcu);
2682 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2683 kfree(old_slots);
2386 } 2684 }
2685
2387 r = 0; 2686 r = 0;
2687 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2688 r = -EFAULT;
2689out_free:
2690 vfree(dirty_bitmap);
2388out: 2691out:
2389 up_write(&kvm->slots_lock); 2692 mutex_unlock(&kvm->slots_lock);
2390 return r; 2693 return r;
2391} 2694}
2392 2695
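The dirty-log path above now builds a memslots copy carrying a fresh, zeroed bitmap, publishes it with rcu_assign_pointer(), waits out readers with synchronize_srcu_expedited(), and only then harvests and frees the old bitmap. A single-threaded sketch of that copy/publish/reclaim ordering; the grace-period wait is reduced to a comment because the SRCU machinery is the one part that cannot be reproduced meaningfully in a few lines:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct slots { unsigned long *dirty_bitmap; size_t words; };

static struct slots *current_slots;   /* what readers dereference */

int main(void)
{
	size_t words = 4;
	struct slots *old_slots, *new_slots;

	current_slots = calloc(1, sizeof(*current_slots));
	current_slots->dirty_bitmap = calloc(words, sizeof(unsigned long));
	current_slots->words = words;
	current_slots->dirty_bitmap[0] = 0x5;          /* pretend pages got dirtied */

	/* 1. build a copy that points at a fresh, zeroed bitmap */
	new_slots = malloc(sizeof(*new_slots));
	memcpy(new_slots, current_slots, sizeof(*new_slots));
	new_slots->dirty_bitmap = calloc(words, sizeof(unsigned long));

	/* 2. publish the copy (rcu_assign_pointer() in the kernel) */
	old_slots = current_slots;
	current_slots = new_slots;

	/* 3. wait for readers of old_slots to drain
	 *    (synchronize_srcu_expedited(&kvm->srcu) in the kernel) */

	/* 4. the old bitmap is now private: hand it to userspace and free it */
	printf("harvested bitmap word 0: %#lx\n", old_slots->dirty_bitmap[0]);
	free(old_slots->dirty_bitmap);
	free(old_slots);
	return 0;
}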
@@ -2469,6 +2772,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2469 if (vpic) { 2772 if (vpic) {
2470 r = kvm_ioapic_init(kvm); 2773 r = kvm_ioapic_init(kvm);
2471 if (r) { 2774 if (r) {
2775 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
2776 &vpic->dev);
2472 kfree(vpic); 2777 kfree(vpic);
2473 goto create_irqchip_unlock; 2778 goto create_irqchip_unlock;
2474 } 2779 }
@@ -2480,10 +2785,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2480 r = kvm_setup_default_irq_routing(kvm); 2785 r = kvm_setup_default_irq_routing(kvm);
2481 if (r) { 2786 if (r) {
2482 mutex_lock(&kvm->irq_lock); 2787 mutex_lock(&kvm->irq_lock);
2483 kfree(kvm->arch.vpic); 2788 kvm_ioapic_destroy(kvm);
2484 kfree(kvm->arch.vioapic); 2789 kvm_destroy_pic(kvm);
2485 kvm->arch.vpic = NULL;
2486 kvm->arch.vioapic = NULL;
2487 mutex_unlock(&kvm->irq_lock); 2790 mutex_unlock(&kvm->irq_lock);
2488 } 2791 }
2489 create_irqchip_unlock: 2792 create_irqchip_unlock:
@@ -2499,7 +2802,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2499 sizeof(struct kvm_pit_config))) 2802 sizeof(struct kvm_pit_config)))
2500 goto out; 2803 goto out;
2501 create_pit: 2804 create_pit:
2502 down_write(&kvm->slots_lock); 2805 mutex_lock(&kvm->slots_lock);
2503 r = -EEXIST; 2806 r = -EEXIST;
2504 if (kvm->arch.vpit) 2807 if (kvm->arch.vpit)
2505 goto create_pit_unlock; 2808 goto create_pit_unlock;
@@ -2508,7 +2811,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2508 if (kvm->arch.vpit) 2811 if (kvm->arch.vpit)
2509 r = 0; 2812 r = 0;
2510 create_pit_unlock: 2813 create_pit_unlock:
2511 up_write(&kvm->slots_lock); 2814 mutex_unlock(&kvm->slots_lock);
2512 break; 2815 break;
2513 case KVM_IRQ_LINE_STATUS: 2816 case KVM_IRQ_LINE_STATUS:
2514 case KVM_IRQ_LINE: { 2817 case KVM_IRQ_LINE: {
@@ -2725,7 +3028,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2725 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 3028 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2726 return 0; 3029 return 0;
2727 3030
2728 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); 3031 return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
2729} 3032}
2730 3033
2731static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3034static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
@@ -2734,17 +3037,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2734 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 3037 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2735 return 0; 3038 return 0;
2736 3039
2737 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); 3040 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
2738} 3041}
2739 3042
2740static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3043gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
2741 struct kvm_vcpu *vcpu) 3044{
3045 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3046 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3047}
3048
3049gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3050{
3051 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3052 access |= PFERR_FETCH_MASK;
3053 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3054}
3055
3056gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3057{
3058 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3059 access |= PFERR_WRITE_MASK;
3060 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3061}
3062
3063/* used to access guest memory without a CPL/permission check */
3064gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3065{
3066 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
3067}
3068
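The four gva-to-gpa helpers above differ only in the access mask handed to the MMU walker: a user-mode bit when the current CPL is 3, plus a fetch or write flag, or zero for kernel-internal ("system") access. A sketch of composing that mask; the PFERR bit values here are assumptions modelled on the page-fault error-code layout:

#include <stdio.h>
#include <stdint.h>

#define PFERR_WRITE_MASK (1u << 1)   /* assumed: mirrors the #PF error code bits */
#define PFERR_USER_MASK  (1u << 2)
#define PFERR_FETCH_MASK (1u << 4)

enum gva_op { GVA_READ, GVA_FETCH, GVA_WRITE, GVA_SYSTEM };

static uint32_t access_mask(int cpl, enum gva_op op)
{
	uint32_t access = 0;

	if (op == GVA_SYSTEM)
		return 0;                        /* kernel-internal access: no CPL check */
	if (cpl == 3)
		access |= PFERR_USER_MASK;
	if (op == GVA_FETCH)
		access |= PFERR_FETCH_MASK;
	else if (op == GVA_WRITE)
		access |= PFERR_WRITE_MASK;
	return access;
}

int main(void)
{
	printf("user write:   %#x\n", access_mask(3, GVA_WRITE));
	printf("kernel fetch: %#x\n", access_mask(0, GVA_FETCH));
	return 0;
}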
3069static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3070 struct kvm_vcpu *vcpu, u32 access,
3071 u32 *error)
2742{ 3072{
2743 void *data = val; 3073 void *data = val;
2744 int r = X86EMUL_CONTINUE; 3074 int r = X86EMUL_CONTINUE;
2745 3075
2746 while (bytes) { 3076 while (bytes) {
2747 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3077 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
2748 unsigned offset = addr & (PAGE_SIZE-1); 3078 unsigned offset = addr & (PAGE_SIZE-1);
2749 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3079 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2750 int ret; 3080 int ret;
@@ -2767,14 +3097,37 @@ out:
2767 return r; 3097 return r;
2768} 3098}
2769 3099
3100/* used for instruction fetching */
3101static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
3102 struct kvm_vcpu *vcpu, u32 *error)
3103{
3104 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3105 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3106 access | PFERR_FETCH_MASK, error);
3107}
3108
3109static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
3110 struct kvm_vcpu *vcpu, u32 *error)
3111{
3112 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3113 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3114 error);
3115}
3116
3117static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3118 struct kvm_vcpu *vcpu, u32 *error)
3119{
3120 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
3121}
3122
2770static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 3123static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2771 struct kvm_vcpu *vcpu) 3124 struct kvm_vcpu *vcpu, u32 *error)
2772{ 3125{
2773 void *data = val; 3126 void *data = val;
2774 int r = X86EMUL_CONTINUE; 3127 int r = X86EMUL_CONTINUE;
2775 3128
2776 while (bytes) { 3129 while (bytes) {
2777 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3130 gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
2778 unsigned offset = addr & (PAGE_SIZE-1); 3131 unsigned offset = addr & (PAGE_SIZE-1);
2779 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3132 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2780 int ret; 3133 int ret;
@@ -2804,6 +3157,7 @@ static int emulator_read_emulated(unsigned long addr,
2804 struct kvm_vcpu *vcpu) 3157 struct kvm_vcpu *vcpu)
2805{ 3158{
2806 gpa_t gpa; 3159 gpa_t gpa;
3160 u32 error_code;
2807 3161
2808 if (vcpu->mmio_read_completed) { 3162 if (vcpu->mmio_read_completed) {
2809 memcpy(val, vcpu->mmio_data, bytes); 3163 memcpy(val, vcpu->mmio_data, bytes);
@@ -2813,17 +3167,20 @@ static int emulator_read_emulated(unsigned long addr,
2813 return X86EMUL_CONTINUE; 3167 return X86EMUL_CONTINUE;
2814 } 3168 }
2815 3169
2816 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3170 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
3171
3172 if (gpa == UNMAPPED_GVA) {
3173 kvm_inject_page_fault(vcpu, addr, error_code);
3174 return X86EMUL_PROPAGATE_FAULT;
3175 }
2817 3176
2818 /* For APIC access vmexit */ 3177 /* For APIC access vmexit */
2819 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3178 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2820 goto mmio; 3179 goto mmio;
2821 3180
2822 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 3181 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
2823 == X86EMUL_CONTINUE) 3182 == X86EMUL_CONTINUE)
2824 return X86EMUL_CONTINUE; 3183 return X86EMUL_CONTINUE;
2825 if (gpa == UNMAPPED_GVA)
2826 return X86EMUL_PROPAGATE_FAULT;
2827 3184
2828mmio: 3185mmio:
2829 /* 3186 /*
@@ -2862,11 +3219,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
2862 struct kvm_vcpu *vcpu) 3219 struct kvm_vcpu *vcpu)
2863{ 3220{
2864 gpa_t gpa; 3221 gpa_t gpa;
3222 u32 error_code;
2865 3223
2866 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3224 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
2867 3225
2868 if (gpa == UNMAPPED_GVA) { 3226 if (gpa == UNMAPPED_GVA) {
2869 kvm_inject_page_fault(vcpu, addr, 2); 3227 kvm_inject_page_fault(vcpu, addr, error_code);
2870 return X86EMUL_PROPAGATE_FAULT; 3228 return X86EMUL_PROPAGATE_FAULT;
2871 } 3229 }
2872 3230
@@ -2930,7 +3288,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
2930 char *kaddr; 3288 char *kaddr;
2931 u64 val; 3289 u64 val;
2932 3290
2933 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3291 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
2934 3292
2935 if (gpa == UNMAPPED_GVA || 3293 if (gpa == UNMAPPED_GVA ||
2936 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3294 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -2967,35 +3325,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2967 3325
2968int emulate_clts(struct kvm_vcpu *vcpu) 3326int emulate_clts(struct kvm_vcpu *vcpu)
2969{ 3327{
2970 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 3328 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3329 kvm_x86_ops->fpu_activate(vcpu);
2971 return X86EMUL_CONTINUE; 3330 return X86EMUL_CONTINUE;
2972} 3331}
2973 3332
2974int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3333int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2975{ 3334{
2976 struct kvm_vcpu *vcpu = ctxt->vcpu; 3335 return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
2977
2978 switch (dr) {
2979 case 0 ... 3:
2980 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2981 return X86EMUL_CONTINUE;
2982 default:
2983 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2984 return X86EMUL_UNHANDLEABLE;
2985 }
2986} 3336}
2987 3337
2988int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3338int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2989{ 3339{
2990 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3340 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2991 int exception;
2992 3341
2993 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 3342 return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
2994 if (exception) {
2995 /* FIXME: better handling */
2996 return X86EMUL_UNHANDLEABLE;
2997 }
2998 return X86EMUL_CONTINUE;
2999} 3343}
3000 3344
3001void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3345void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3009,7 +3353,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3009 3353
3010 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 3354 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
3011 3355
3012 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); 3356 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
3013 3357
3014 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 3358 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
3015 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 3359 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -3017,7 +3361,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3017EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3361EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3018 3362
3019static struct x86_emulate_ops emulate_ops = { 3363static struct x86_emulate_ops emulate_ops = {
3020 .read_std = kvm_read_guest_virt, 3364 .read_std = kvm_read_guest_virt_system,
3365 .fetch = kvm_fetch_guest_virt,
3021 .read_emulated = emulator_read_emulated, 3366 .read_emulated = emulator_read_emulated,
3022 .write_emulated = emulator_write_emulated, 3367 .write_emulated = emulator_write_emulated,
3023 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3368 .cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -3060,8 +3405,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3060 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3405 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3061 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 3406 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
3062 vcpu->arch.emulate_ctxt.mode = 3407 vcpu->arch.emulate_ctxt.mode =
3408 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3063 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3409 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
3064 ? X86EMUL_MODE_REAL : cs_l 3410 ? X86EMUL_MODE_VM86 : cs_l
3065 ? X86EMUL_MODE_PROT64 : cs_db 3411 ? X86EMUL_MODE_PROT64 : cs_db
3066 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3412 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3067 3413
@@ -3153,12 +3499,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
3153 gva_t q = vcpu->arch.pio.guest_gva; 3499 gva_t q = vcpu->arch.pio.guest_gva;
3154 unsigned bytes; 3500 unsigned bytes;
3155 int ret; 3501 int ret;
3502 u32 error_code;
3156 3503
3157 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 3504 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
3158 if (vcpu->arch.pio.in) 3505 if (vcpu->arch.pio.in)
3159 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 3506 ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
3160 else 3507 else
3161 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 3508 ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
3509
3510 if (ret == X86EMUL_PROPAGATE_FAULT)
3511 kvm_inject_page_fault(vcpu, q, error_code);
3512
3162 return ret; 3513 return ret;
3163} 3514}
3164 3515
@@ -3179,7 +3530,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
3179 if (io->in) { 3530 if (io->in) {
3180 r = pio_copy_data(vcpu); 3531 r = pio_copy_data(vcpu);
3181 if (r) 3532 if (r)
3182 return r; 3533 goto out;
3183 } 3534 }
3184 3535
3185 delta = 1; 3536 delta = 1;
@@ -3206,7 +3557,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
3206 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 3557 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
3207 } 3558 }
3208 } 3559 }
3209 3560out:
3210 io->count -= io->cur_count; 3561 io->count -= io->cur_count;
3211 io->cur_count = 0; 3562 io->cur_count = 0;
3212 3563
@@ -3219,11 +3570,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3219 int r; 3570 int r;
3220 3571
3221 if (vcpu->arch.pio.in) 3572 if (vcpu->arch.pio.in)
3222 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3573 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3223 vcpu->arch.pio.size, pd); 3574 vcpu->arch.pio.size, pd);
3224 else 3575 else
3225 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3576 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3226 vcpu->arch.pio.size, pd); 3577 vcpu->arch.pio.port, vcpu->arch.pio.size,
3578 pd);
3227 return r; 3579 return r;
3228} 3580}
3229 3581
@@ -3234,7 +3586,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
3234 int i, r = 0; 3586 int i, r = 0;
3235 3587
3236 for (i = 0; i < io->cur_count; i++) { 3588 for (i = 0; i < io->cur_count; i++) {
3237 if (kvm_io_bus_write(&vcpu->kvm->pio_bus, 3589 if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3238 io->port, io->size, pd)) { 3590 io->port, io->size, pd)) {
3239 r = -EOPNOTSUPP; 3591 r = -EOPNOTSUPP;
3240 break; 3592 break;
@@ -3248,6 +3600,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3248{ 3600{
3249 unsigned long val; 3601 unsigned long val;
3250 3602
3603 trace_kvm_pio(!in, port, size, 1);
3604
3251 vcpu->run->exit_reason = KVM_EXIT_IO; 3605 vcpu->run->exit_reason = KVM_EXIT_IO;
3252 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3606 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3253 vcpu->run->io.size = vcpu->arch.pio.size = size; 3607 vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3259,11 +3613,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3259 vcpu->arch.pio.down = 0; 3613 vcpu->arch.pio.down = 0;
3260 vcpu->arch.pio.rep = 0; 3614 vcpu->arch.pio.rep = 0;
3261 3615
3262 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3616 if (!vcpu->arch.pio.in) {
3263 size, 1); 3617 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3264 3618 memcpy(vcpu->arch.pio_data, &val, 4);
3265 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 3619 }
3266 memcpy(vcpu->arch.pio_data, &val, 4);
3267 3620
3268 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3621 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3269 complete_pio(vcpu); 3622 complete_pio(vcpu);
@@ -3280,6 +3633,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3280 unsigned now, in_page; 3633 unsigned now, in_page;
3281 int ret = 0; 3634 int ret = 0;
3282 3635
3636 trace_kvm_pio(!in, port, size, count);
3637
3283 vcpu->run->exit_reason = KVM_EXIT_IO; 3638 vcpu->run->exit_reason = KVM_EXIT_IO;
3284 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3639 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3285 vcpu->run->io.size = vcpu->arch.pio.size = size; 3640 vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3291,9 +3646,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3291 vcpu->arch.pio.down = down; 3646 vcpu->arch.pio.down = down;
3292 vcpu->arch.pio.rep = rep; 3647 vcpu->arch.pio.rep = rep;
3293 3648
3294 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
3295 size, count);
3296
3297 if (!count) { 3649 if (!count) {
3298 kvm_x86_ops->skip_emulated_instruction(vcpu); 3650 kvm_x86_ops->skip_emulated_instruction(vcpu);
3299 return 1; 3651 return 1;
@@ -3325,10 +3677,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3325 if (!vcpu->arch.pio.in) { 3677 if (!vcpu->arch.pio.in) {
3326 /* string PIO write */ 3678 /* string PIO write */
3327 ret = pio_copy_data(vcpu); 3679 ret = pio_copy_data(vcpu);
3328 if (ret == X86EMUL_PROPAGATE_FAULT) { 3680 if (ret == X86EMUL_PROPAGATE_FAULT)
3329 kvm_inject_gp(vcpu, 0);
3330 return 1; 3681 return 1;
3331 }
3332 if (ret == 0 && !pio_string_write(vcpu)) { 3682 if (ret == 0 && !pio_string_write(vcpu)) {
3333 complete_pio(vcpu); 3683 complete_pio(vcpu);
3334 if (vcpu->arch.pio.count == 0) 3684 if (vcpu->arch.pio.count == 0)
@@ -3487,11 +3837,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
3487 return a0 | ((gpa_t)a1 << 32); 3837 return a0 | ((gpa_t)a1 << 32);
3488} 3838}
3489 3839
3840int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
3841{
3842 u64 param, ingpa, outgpa, ret;
3843 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
3844 bool fast, longmode;
3845 int cs_db, cs_l;
3846
3847 /*
 3848	 * a hypercall generates #UD from non-zero CPL or real mode,
 3849	 * per the Hyper-V spec
3850 */
3851 if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
3852 kvm_queue_exception(vcpu, UD_VECTOR);
3853 return 0;
3854 }
3855
3856 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3857 longmode = is_long_mode(vcpu) && cs_l == 1;
3858
3859 if (!longmode) {
3860 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
3861 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
3862 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
3863 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
3864 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
3865 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
3866 }
3867#ifdef CONFIG_X86_64
3868 else {
3869 param = kvm_register_read(vcpu, VCPU_REGS_RCX);
3870 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
3871 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
3872 }
3873#endif
3874
3875 code = param & 0xffff;
3876 fast = (param >> 16) & 0x1;
3877 rep_cnt = (param >> 32) & 0xfff;
3878 rep_idx = (param >> 48) & 0xfff;
3879
3880 trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
3881
3882 switch (code) {
3883 case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
3884 kvm_vcpu_on_spin(vcpu);
3885 break;
3886 default:
3887 res = HV_STATUS_INVALID_HYPERCALL_CODE;
3888 break;
3889 }
3890
3891 ret = res | (((u64)rep_done & 0xfff) << 32);
3892 if (longmode) {
3893 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3894 } else {
3895 kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
3896 kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
3897 }
3898
3899 return 1;
3900}
3901
3490int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 3902int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
3491{ 3903{
3492 unsigned long nr, a0, a1, a2, a3, ret; 3904 unsigned long nr, a0, a1, a2, a3, ret;
3493 int r = 1; 3905 int r = 1;
3494 3906
3907 if (kvm_hv_hypercall_enabled(vcpu->kvm))
3908 return kvm_hv_hypercall(vcpu);
3909
3495 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 3910 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
3496 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 3911 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
3497 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 3912 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -3534,10 +3949,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
3534int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 3949int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3535{ 3950{
3536 char instruction[3]; 3951 char instruction[3];
3537 int ret = 0;
3538 unsigned long rip = kvm_rip_read(vcpu); 3952 unsigned long rip = kvm_rip_read(vcpu);
3539 3953
3540
3541 /* 3954 /*
3542 * Blow out the MMU to ensure that no other VCPU has an active mapping 3955 * Blow out the MMU to ensure that no other VCPU has an active mapping
3543 * so that the updated hypercall appears atomically across all 3956 * so that the updated hypercall appears atomically across all
@@ -3546,11 +3959,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3546 kvm_mmu_zap_all(vcpu->kvm); 3959 kvm_mmu_zap_all(vcpu->kvm);
3547 3960
3548 kvm_x86_ops->patch_hypercall(vcpu, instruction); 3961 kvm_x86_ops->patch_hypercall(vcpu, instruction);
3549 if (emulator_write_emulated(rip, instruction, 3, vcpu)
3550 != X86EMUL_CONTINUE)
3551 ret = -EFAULT;
3552 3962
3553 return ret; 3963 return emulator_write_emulated(rip, instruction, 3, vcpu);
3554} 3964}
3555 3965
3556static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3966static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -3583,10 +3993,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3583{ 3993{
3584 unsigned long value; 3994 unsigned long value;
3585 3995
3586 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3587 switch (cr) { 3996 switch (cr) {
3588 case 0: 3997 case 0:
3589 value = vcpu->arch.cr0; 3998 value = kvm_read_cr0(vcpu);
3590 break; 3999 break;
3591 case 2: 4000 case 2:
3592 value = vcpu->arch.cr2; 4001 value = vcpu->arch.cr2;
@@ -3595,7 +4004,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3595 value = vcpu->arch.cr3; 4004 value = vcpu->arch.cr3;
3596 break; 4005 break;
3597 case 4: 4006 case 4:
3598 value = vcpu->arch.cr4; 4007 value = kvm_read_cr4(vcpu);
3599 break; 4008 break;
3600 case 8: 4009 case 8:
3601 value = kvm_get_cr8(vcpu); 4010 value = kvm_get_cr8(vcpu);
@@ -3613,7 +4022,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3613{ 4022{
3614 switch (cr) { 4023 switch (cr) {
3615 case 0: 4024 case 0:
3616 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 4025 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3617 *rflags = kvm_get_rflags(vcpu); 4026 *rflags = kvm_get_rflags(vcpu);
3618 break; 4027 break;
3619 case 2: 4028 case 2:
@@ -3623,7 +4032,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3623 kvm_set_cr3(vcpu, val); 4032 kvm_set_cr3(vcpu, val);
3624 break; 4033 break;
3625 case 4: 4034 case 4:
3626 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 4035 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3627 break; 4036 break;
3628 case 8: 4037 case 8:
3629 kvm_set_cr8(vcpu, val & 0xfUL); 4038 kvm_set_cr8(vcpu, val & 0xfUL);
@@ -3690,6 +4099,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3690 } 4099 }
3691 return best; 4100 return best;
3692} 4101}
4102EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
3693 4103
3694int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 4104int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3695{ 4105{
@@ -3773,14 +4183,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
3773static void vapic_exit(struct kvm_vcpu *vcpu) 4183static void vapic_exit(struct kvm_vcpu *vcpu)
3774{ 4184{
3775 struct kvm_lapic *apic = vcpu->arch.apic; 4185 struct kvm_lapic *apic = vcpu->arch.apic;
4186 int idx;
3776 4187
3777 if (!apic || !apic->vapic_addr) 4188 if (!apic || !apic->vapic_addr)
3778 return; 4189 return;
3779 4190
3780 down_read(&vcpu->kvm->slots_lock); 4191 idx = srcu_read_lock(&vcpu->kvm->srcu);
3781 kvm_release_page_dirty(apic->vapic_page); 4192 kvm_release_page_dirty(apic->vapic_page);
3782 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 4193 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3783 up_read(&vcpu->kvm->slots_lock); 4194 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3784} 4195}
3785 4196
3786static void update_cr8_intercept(struct kvm_vcpu *vcpu) 4197static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -3876,12 +4287,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3876 r = 0; 4287 r = 0;
3877 goto out; 4288 goto out;
3878 } 4289 }
4290 if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
4291 vcpu->fpu_active = 0;
4292 kvm_x86_ops->fpu_deactivate(vcpu);
4293 }
3879 } 4294 }
3880 4295
3881 preempt_disable(); 4296 preempt_disable();
3882 4297
3883 kvm_x86_ops->prepare_guest_switch(vcpu); 4298 kvm_x86_ops->prepare_guest_switch(vcpu);
3884 kvm_load_guest_fpu(vcpu); 4299 if (vcpu->fpu_active)
4300 kvm_load_guest_fpu(vcpu);
3885 4301
3886 local_irq_disable(); 4302 local_irq_disable();
3887 4303
@@ -3909,7 +4325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3909 kvm_lapic_sync_to_vapic(vcpu); 4325 kvm_lapic_sync_to_vapic(vcpu);
3910 } 4326 }
3911 4327
3912 up_read(&vcpu->kvm->slots_lock); 4328 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
3913 4329
3914 kvm_guest_enter(); 4330 kvm_guest_enter();
3915 4331
@@ -3951,7 +4367,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3951 4367
3952 preempt_enable(); 4368 preempt_enable();
3953 4369
3954 down_read(&vcpu->kvm->slots_lock); 4370 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3955 4371
3956 /* 4372 /*
3957 * Profile KVM exit RIPs: 4373 * Profile KVM exit RIPs:
@@ -3973,6 +4389,7 @@ out:
3973static int __vcpu_run(struct kvm_vcpu *vcpu) 4389static int __vcpu_run(struct kvm_vcpu *vcpu)
3974{ 4390{
3975 int r; 4391 int r;
4392 struct kvm *kvm = vcpu->kvm;
3976 4393
3977 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 4394 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3978 pr_debug("vcpu %d received sipi with vector # %x\n", 4395 pr_debug("vcpu %d received sipi with vector # %x\n",
@@ -3984,7 +4401,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
3984 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4401 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3985 } 4402 }
3986 4403
3987 down_read(&vcpu->kvm->slots_lock); 4404 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3988 vapic_enter(vcpu); 4405 vapic_enter(vcpu);
3989 4406
3990 r = 1; 4407 r = 1;
@@ -3992,9 +4409,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
3992 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 4409 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3993 r = vcpu_enter_guest(vcpu); 4410 r = vcpu_enter_guest(vcpu);
3994 else { 4411 else {
3995 up_read(&vcpu->kvm->slots_lock); 4412 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
3996 kvm_vcpu_block(vcpu); 4413 kvm_vcpu_block(vcpu);
3997 down_read(&vcpu->kvm->slots_lock); 4414 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3998 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4415 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3999 { 4416 {
4000 switch(vcpu->arch.mp_state) { 4417 switch(vcpu->arch.mp_state) {
@@ -4029,13 +4446,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4029 ++vcpu->stat.signal_exits; 4446 ++vcpu->stat.signal_exits;
4030 } 4447 }
4031 if (need_resched()) { 4448 if (need_resched()) {
4032 up_read(&vcpu->kvm->slots_lock); 4449 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4033 kvm_resched(vcpu); 4450 kvm_resched(vcpu);
4034 down_read(&vcpu->kvm->slots_lock); 4451 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
4035 } 4452 }
4036 } 4453 }
4037 4454
4038 up_read(&vcpu->kvm->slots_lock); 4455 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4039 post_kvm_run_save(vcpu); 4456 post_kvm_run_save(vcpu);
4040 4457
4041 vapic_exit(vcpu); 4458 vapic_exit(vcpu);
@@ -4074,10 +4491,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4074 vcpu->mmio_read_completed = 1; 4491 vcpu->mmio_read_completed = 1;
4075 vcpu->mmio_needed = 0; 4492 vcpu->mmio_needed = 0;
4076 4493
4077 down_read(&vcpu->kvm->slots_lock); 4494 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4078 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, 4495 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
4079 EMULTYPE_NO_DECODE); 4496 EMULTYPE_NO_DECODE);
4080 up_read(&vcpu->kvm->slots_lock); 4497 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4081 if (r == EMULATE_DO_MMIO) { 4498 if (r == EMULATE_DO_MMIO) {
4082 /* 4499 /*
4083 * Read-modify-write. Back to userspace. 4500 * Read-modify-write. Back to userspace.
@@ -4204,13 +4621,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4204 sregs->gdt.limit = dt.limit; 4621 sregs->gdt.limit = dt.limit;
4205 sregs->gdt.base = dt.base; 4622 sregs->gdt.base = dt.base;
4206 4623
4207 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4624 sregs->cr0 = kvm_read_cr0(vcpu);
4208 sregs->cr0 = vcpu->arch.cr0;
4209 sregs->cr2 = vcpu->arch.cr2; 4625 sregs->cr2 = vcpu->arch.cr2;
4210 sregs->cr3 = vcpu->arch.cr3; 4626 sregs->cr3 = vcpu->arch.cr3;
4211 sregs->cr4 = vcpu->arch.cr4; 4627 sregs->cr4 = kvm_read_cr4(vcpu);
4212 sregs->cr8 = kvm_get_cr8(vcpu); 4628 sregs->cr8 = kvm_get_cr8(vcpu);
4213 sregs->efer = vcpu->arch.shadow_efer; 4629 sregs->efer = vcpu->arch.efer;
4214 sregs->apic_base = kvm_get_apic_base(vcpu); 4630 sregs->apic_base = kvm_get_apic_base(vcpu);
4215 4631
4216 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 4632 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
@@ -4298,14 +4714,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4298{ 4714{
4299 struct descriptor_table dtable; 4715 struct descriptor_table dtable;
4300 u16 index = selector >> 3; 4716 u16 index = selector >> 3;
4717 int ret;
4718 u32 err;
4719 gva_t addr;
4301 4720
4302 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4721 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4303 4722
4304 if (dtable.limit < index * 8 + 7) { 4723 if (dtable.limit < index * 8 + 7) {
4305 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4724 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4306 return 1; 4725 return X86EMUL_PROPAGATE_FAULT;
4307 } 4726 }
4308 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4727 addr = dtable.base + index * 8;
4728 ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
4729 vcpu, &err);
4730 if (ret == X86EMUL_PROPAGATE_FAULT)
4731 kvm_inject_page_fault(vcpu, addr, err);
4732
4733 return ret;
4309} 4734}
4310 4735
4311/* allowed just for 8 bytes segments */ 4736/* allowed just for 8 bytes segments */
@@ -4319,15 +4744,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4319 4744
4320 if (dtable.limit < index * 8 + 7) 4745 if (dtable.limit < index * 8 + 7)
4321 return 1; 4746 return 1;
4322 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4747 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
4748}
4749
4750static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
4751 struct desc_struct *seg_desc)
4752{
4753 u32 base_addr = get_desc_base(seg_desc);
4754
4755 return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
4323} 4756}
4324 4757
4325static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, 4758static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
4326 struct desc_struct *seg_desc) 4759 struct desc_struct *seg_desc)
4327{ 4760{
4328 u32 base_addr = get_desc_base(seg_desc); 4761 u32 base_addr = get_desc_base(seg_desc);
4329 4762
4330 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4763 return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
4331} 4764}
4332 4765
4333static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 4766static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -4338,18 +4771,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4338 return kvm_seg.selector; 4771 return kvm_seg.selector;
4339} 4772}
4340 4773
4341static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
4342 u16 selector,
4343 struct kvm_segment *kvm_seg)
4344{
4345 struct desc_struct seg_desc;
4346
4347 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
4348 return 1;
4349 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
4350 return 0;
4351}
4352
4353static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 4774static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4354{ 4775{
4355 struct kvm_segment segvar = { 4776 struct kvm_segment segvar = {
@@ -4367,7 +4788,7 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
4367 .unusable = 0, 4788 .unusable = 0,
4368 }; 4789 };
4369 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 4790 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4370 return 0; 4791 return X86EMUL_CONTINUE;
4371} 4792}
4372 4793
4373static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) 4794static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
@@ -4377,24 +4798,112 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4377 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); 4798 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4378} 4799}
4379 4800
4380int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4801int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
4381 int type_bits, int seg)
4382{ 4802{
4383 struct kvm_segment kvm_seg; 4803 struct kvm_segment kvm_seg;
4804 struct desc_struct seg_desc;
4805 u8 dpl, rpl, cpl;
4806 unsigned err_vec = GP_VECTOR;
4807 u32 err_code = 0;
4808 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
4809 int ret;
4384 4810
4385 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) 4811 if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
4386 return kvm_load_realmode_segment(vcpu, selector, seg); 4812 return kvm_load_realmode_segment(vcpu, selector, seg);
4387 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
4388 return 1;
4389 kvm_seg.type |= type_bits;
4390 4813
4391 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 4814 /* NULL selector is not valid for TR, CS and SS */
4392 seg != VCPU_SREG_LDTR) 4815 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
4393 if (!kvm_seg.s) 4816 && null_selector)
4394 kvm_seg.unusable = 1; 4817 goto exception;
4818
4819 /* TR should be in GDT only */
4820 if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
4821 goto exception;
4822
4823 ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
4824 if (ret)
4825 return ret;
4826
4827 seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
4828
4829 if (null_selector) { /* for NULL selector skip all following checks */
4830 kvm_seg.unusable = 1;
4831 goto load;
4832 }
4833
4834 err_code = selector & 0xfffc;
4835 err_vec = GP_VECTOR;
4395 4836
 4837	/* can't load system descriptor into segment selector */
4838 if (seg <= VCPU_SREG_GS && !kvm_seg.s)
4839 goto exception;
4840
4841 if (!kvm_seg.present) {
4842 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
4843 goto exception;
4844 }
4845
4846 rpl = selector & 3;
4847 dpl = kvm_seg.dpl;
4848 cpl = kvm_x86_ops->get_cpl(vcpu);
4849
4850 switch (seg) {
4851 case VCPU_SREG_SS:
4852 /*
 4853		 * segment is not a writable data segment, or the selector's
 4854		 * RPL != CPL, or the descriptor's DPL != CPL
4855 */
4856 if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
4857 goto exception;
4858 break;
4859 case VCPU_SREG_CS:
4860 if (!(kvm_seg.type & 8))
4861 goto exception;
4862
4863 if (kvm_seg.type & 4) {
4864 /* conforming */
4865 if (dpl > cpl)
4866 goto exception;
4867 } else {
4868 /* nonconforming */
4869 if (rpl > cpl || dpl != cpl)
4870 goto exception;
4871 }
4872 /* CS(RPL) <- CPL */
4873 selector = (selector & 0xfffc) | cpl;
4874 break;
4875 case VCPU_SREG_TR:
4876 if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
4877 goto exception;
4878 break;
4879 case VCPU_SREG_LDTR:
4880 if (kvm_seg.s || kvm_seg.type != 2)
4881 goto exception;
4882 break;
4883 default: /* DS, ES, FS, or GS */
4884 /*
4885 * segment is not a data or readable code segment or
4886 * ((segment is a data or nonconforming code segment)
4887 * and (both RPL and CPL > DPL))
4888 */
4889 if ((kvm_seg.type & 0xa) == 0x8 ||
4890 (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
4891 goto exception;
4892 break;
4893 }
4894
4895 if (!kvm_seg.unusable && kvm_seg.s) {
4896 /* mark segment as accessed */
4897 kvm_seg.type |= 1;
4898 seg_desc.type |= 1;
4899 save_guest_segment_descriptor(vcpu, selector, &seg_desc);
4900 }
4901load:
4396 kvm_set_segment(vcpu, &kvm_seg, seg); 4902 kvm_set_segment(vcpu, &kvm_seg, seg);
4397 return 0; 4903 return X86EMUL_CONTINUE;
4904exception:
4905 kvm_queue_exception_e(vcpu, err_vec, err_code);
4906 return X86EMUL_PROPAGATE_FAULT;
4398} 4907}
4399 4908
4400static void save_state_to_tss32(struct kvm_vcpu *vcpu, 4909static void save_state_to_tss32(struct kvm_vcpu *vcpu,
@@ -4420,6 +4929,14 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4420 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4929 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4421} 4930}
4422 4931
4932static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
4933{
4934 struct kvm_segment kvm_seg;
4935 kvm_get_segment(vcpu, &kvm_seg, seg);
4936 kvm_seg.selector = sel;
4937 kvm_set_segment(vcpu, &kvm_seg, seg);
4938}
4939
4423static int load_state_from_tss32(struct kvm_vcpu *vcpu, 4940static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4424 struct tss_segment_32 *tss) 4941 struct tss_segment_32 *tss)
4425{ 4942{
@@ -4437,25 +4954,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4437 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 4954 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4438 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 4955 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4439 4956
4440 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 4957 /*
4958 * SDM says that segment selectors are loaded before segment
4959 * descriptors
4960 */
4961 kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
4962 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
4963 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
4964 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
4965 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
4966 kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
4967 kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
4968
4969 /*
 4970	 * Now load the segment descriptors. If a fault happens at this
 4971	 * stage, it is handled in the context of the new task
4972 */
4973 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
4441 return 1; 4974 return 1;
4442 4975
4443 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4976 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
4444 return 1; 4977 return 1;
4445 4978
4446 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4979 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
4447 return 1; 4980 return 1;
4448 4981
4449 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4982 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
4450 return 1; 4983 return 1;
4451 4984
4452 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4985 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
4453 return 1; 4986 return 1;
4454 4987
4455 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 4988 if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
4456 return 1; 4989 return 1;
4457 4990
4458 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 4991 if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
4459 return 1; 4992 return 1;
4460 return 0; 4993 return 0;
4461} 4994}
@@ -4495,19 +5028,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4495 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 5028 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
4496 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 5029 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
4497 5030
4498 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 5031 /*
5032 * SDM says that segment selectors are loaded before segment
5033 * descriptors
5034 */
5035 kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
5036 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5037 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5038 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5039 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5040
5041 /*
 5042	 * Now load the segment descriptors. If a fault happens at this
 5043	 * stage, it is handled in the context of the new task
5044 */
5045 if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
4499 return 1; 5046 return 1;
4500 5047
4501 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 5048 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
4502 return 1; 5049 return 1;
4503 5050
4504 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 5051 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
4505 return 1; 5052 return 1;
4506 5053
4507 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 5054 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
4508 return 1; 5055 return 1;
4509 5056
4510 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 5057 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
4511 return 1; 5058 return 1;
4512 return 0; 5059 return 0;
4513} 5060}
@@ -4529,7 +5076,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4529 sizeof tss_segment_16)) 5076 sizeof tss_segment_16))
4530 goto out; 5077 goto out;
4531 5078
4532 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 5079 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
4533 &tss_segment_16, sizeof tss_segment_16)) 5080 &tss_segment_16, sizeof tss_segment_16))
4534 goto out; 5081 goto out;
4535 5082
@@ -4537,7 +5084,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4537 tss_segment_16.prev_task_link = old_tss_sel; 5084 tss_segment_16.prev_task_link = old_tss_sel;
4538 5085
4539 if (kvm_write_guest(vcpu->kvm, 5086 if (kvm_write_guest(vcpu->kvm,
4540 get_tss_base_addr(vcpu, nseg_desc), 5087 get_tss_base_addr_write(vcpu, nseg_desc),
4541 &tss_segment_16.prev_task_link, 5088 &tss_segment_16.prev_task_link,
4542 sizeof tss_segment_16.prev_task_link)) 5089 sizeof tss_segment_16.prev_task_link))
4543 goto out; 5090 goto out;
@@ -4568,7 +5115,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4568 sizeof tss_segment_32)) 5115 sizeof tss_segment_32))
4569 goto out; 5116 goto out;
4570 5117
4571 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 5118 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
4572 &tss_segment_32, sizeof tss_segment_32)) 5119 &tss_segment_32, sizeof tss_segment_32))
4573 goto out; 5120 goto out;
4574 5121
@@ -4576,7 +5123,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4576 tss_segment_32.prev_task_link = old_tss_sel; 5123 tss_segment_32.prev_task_link = old_tss_sel;
4577 5124
4578 if (kvm_write_guest(vcpu->kvm, 5125 if (kvm_write_guest(vcpu->kvm,
4579 get_tss_base_addr(vcpu, nseg_desc), 5126 get_tss_base_addr_write(vcpu, nseg_desc),
4580 &tss_segment_32.prev_task_link, 5127 &tss_segment_32.prev_task_link,
4581 sizeof tss_segment_32.prev_task_link)) 5128 sizeof tss_segment_32.prev_task_link))
4582 goto out; 5129 goto out;
@@ -4599,7 +5146,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4599 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 5146 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
4600 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 5147 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
4601 5148
4602 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 5149 old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
4603 5150
4604 /* FIXME: Handle errors. Failure to read either TSS or their 5151 /* FIXME: Handle errors. Failure to read either TSS or their
4605 * descriptors should generate a pagefault. 5152 * descriptors should generate a pagefault.
@@ -4658,7 +5205,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4658 &nseg_desc); 5205 &nseg_desc);
4659 } 5206 }
4660 5207
4661 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 5208 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
4662 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 5209 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4663 tr_seg.type = 11; 5210 tr_seg.type = 11;
4664 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 5211 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
@@ -4689,17 +5236,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4689 5236
4690 kvm_set_cr8(vcpu, sregs->cr8); 5237 kvm_set_cr8(vcpu, sregs->cr8);
4691 5238
4692 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 5239 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
4693 kvm_x86_ops->set_efer(vcpu, sregs->efer); 5240 kvm_x86_ops->set_efer(vcpu, sregs->efer);
4694 kvm_set_apic_base(vcpu, sregs->apic_base); 5241 kvm_set_apic_base(vcpu, sregs->apic_base);
4695 5242
4696 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 5243 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
4697
4698 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4699 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 5244 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4700 vcpu->arch.cr0 = sregs->cr0; 5245 vcpu->arch.cr0 = sregs->cr0;
4701 5246
4702 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 5247 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
4703 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5248 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4704 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5249 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4705 load_pdptrs(vcpu, vcpu->arch.cr3); 5250 load_pdptrs(vcpu, vcpu->arch.cr3);
@@ -4734,7 +5279,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4734 /* Older userspace won't unhalt the vcpu on reset. */ 5279 /* Older userspace won't unhalt the vcpu on reset. */
4735 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 5280 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4736 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 5281 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4737 !(vcpu->arch.cr0 & X86_CR0_PE)) 5282 !is_protmode(vcpu))
4738 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5283 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4739 5284
4740 vcpu_put(vcpu); 5285 vcpu_put(vcpu);
@@ -4832,11 +5377,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4832{ 5377{
4833 unsigned long vaddr = tr->linear_address; 5378 unsigned long vaddr = tr->linear_address;
4834 gpa_t gpa; 5379 gpa_t gpa;
5380 int idx;
4835 5381
4836 vcpu_load(vcpu); 5382 vcpu_load(vcpu);
4837 down_read(&vcpu->kvm->slots_lock); 5383 idx = srcu_read_lock(&vcpu->kvm->srcu);
4838 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 5384 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
4839 up_read(&vcpu->kvm->slots_lock); 5385 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4840 tr->physical_address = gpa; 5386 tr->physical_address = gpa;
4841 tr->valid = gpa != UNMAPPED_GVA; 5387 tr->valid = gpa != UNMAPPED_GVA;
4842 tr->writeable = 1; 5388 tr->writeable = 1;
@@ -4917,14 +5463,14 @@ EXPORT_SYMBOL_GPL(fx_init);
4917 5463
4918void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5464void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4919{ 5465{
4920 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 5466 if (vcpu->guest_fpu_loaded)
4921 return; 5467 return;
4922 5468
4923 vcpu->guest_fpu_loaded = 1; 5469 vcpu->guest_fpu_loaded = 1;
4924 kvm_fx_save(&vcpu->arch.host_fx_image); 5470 kvm_fx_save(&vcpu->arch.host_fx_image);
4925 kvm_fx_restore(&vcpu->arch.guest_fx_image); 5471 kvm_fx_restore(&vcpu->arch.guest_fx_image);
5472 trace_kvm_fpu(1);
4926} 5473}
4927EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4928 5474
4929void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5475void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4930{ 5476{
@@ -4935,8 +5481,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4935 kvm_fx_save(&vcpu->arch.guest_fx_image); 5481 kvm_fx_save(&vcpu->arch.guest_fx_image);
4936 kvm_fx_restore(&vcpu->arch.host_fx_image); 5482 kvm_fx_restore(&vcpu->arch.host_fx_image);
4937 ++vcpu->stat.fpu_reload; 5483 ++vcpu->stat.fpu_reload;
5484 set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
5485 trace_kvm_fpu(0);
4938} 5486}
4939EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4940 5487
4941void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5488void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4942{ 5489{
@@ -5088,11 +5635,13 @@ fail:
5088 5635
5089void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 5636void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
5090{ 5637{
5638 int idx;
5639
5091 kfree(vcpu->arch.mce_banks); 5640 kfree(vcpu->arch.mce_banks);
5092 kvm_free_lapic(vcpu); 5641 kvm_free_lapic(vcpu);
5093 down_read(&vcpu->kvm->slots_lock); 5642 idx = srcu_read_lock(&vcpu->kvm->srcu);
5094 kvm_mmu_destroy(vcpu); 5643 kvm_mmu_destroy(vcpu);
5095 up_read(&vcpu->kvm->slots_lock); 5644 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5096 free_page((unsigned long)vcpu->arch.pio_data); 5645 free_page((unsigned long)vcpu->arch.pio_data);
5097} 5646}
5098 5647
@@ -5103,6 +5652,12 @@ struct kvm *kvm_arch_create_vm(void)
5103 if (!kvm) 5652 if (!kvm)
5104 return ERR_PTR(-ENOMEM); 5653 return ERR_PTR(-ENOMEM);
5105 5654
5655 kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
5656 if (!kvm->arch.aliases) {
5657 kfree(kvm);
5658 return ERR_PTR(-ENOMEM);
5659 }
5660
5106 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5661 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5107 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5662 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5108 5663
@@ -5159,16 +5714,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5159 put_page(kvm->arch.apic_access_page); 5714 put_page(kvm->arch.apic_access_page);
5160 if (kvm->arch.ept_identity_pagetable) 5715 if (kvm->arch.ept_identity_pagetable)
5161 put_page(kvm->arch.ept_identity_pagetable); 5716 put_page(kvm->arch.ept_identity_pagetable);
5717 cleanup_srcu_struct(&kvm->srcu);
5718 kfree(kvm->arch.aliases);
5162 kfree(kvm); 5719 kfree(kvm);
5163} 5720}
5164 5721
5165int kvm_arch_set_memory_region(struct kvm *kvm, 5722int kvm_arch_prepare_memory_region(struct kvm *kvm,
5166 struct kvm_userspace_memory_region *mem, 5723 struct kvm_memory_slot *memslot,
5167 struct kvm_memory_slot old, 5724 struct kvm_memory_slot old,
5725 struct kvm_userspace_memory_region *mem,
5168 int user_alloc) 5726 int user_alloc)
5169{ 5727{
5170 int npages = mem->memory_size >> PAGE_SHIFT; 5728 int npages = memslot->npages;
5171 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
5172 5729
5173 /*To keep backward compatibility with older userspace, 5730 /*To keep backward compatibility with older userspace,
5174 *x86 needs to handle the !user_alloc case. 5731 *x86 needs to handle the !user_alloc case.
@@ -5188,26 +5745,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
5188 if (IS_ERR((void *)userspace_addr)) 5745 if (IS_ERR((void *)userspace_addr))
5189 return PTR_ERR((void *)userspace_addr); 5746 return PTR_ERR((void *)userspace_addr);
5190 5747
5191 /* set userspace_addr atomically for kvm_hva_to_rmapp */
5192 spin_lock(&kvm->mmu_lock);
5193 memslot->userspace_addr = userspace_addr; 5748 memslot->userspace_addr = userspace_addr;
5194 spin_unlock(&kvm->mmu_lock);
5195 } else {
5196 if (!old.user_alloc && old.rmap) {
5197 int ret;
5198
5199 down_write(&current->mm->mmap_sem);
5200 ret = do_munmap(current->mm, old.userspace_addr,
5201 old.npages * PAGE_SIZE);
5202 up_write(&current->mm->mmap_sem);
5203 if (ret < 0)
5204 printk(KERN_WARNING
5205 "kvm_vm_ioctl_set_memory_region: "
5206 "failed to munmap memory\n");
5207 }
5208 } 5749 }
5209 } 5750 }
5210 5751
5752
5753 return 0;
5754}
5755
5756void kvm_arch_commit_memory_region(struct kvm *kvm,
5757 struct kvm_userspace_memory_region *mem,
5758 struct kvm_memory_slot old,
5759 int user_alloc)
5760{
5761
5762 int npages = mem->memory_size >> PAGE_SHIFT;
5763
5764 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
5765 int ret;
5766
5767 down_write(&current->mm->mmap_sem);
5768 ret = do_munmap(current->mm, old.userspace_addr,
5769 old.npages * PAGE_SIZE);
5770 up_write(&current->mm->mmap_sem);
5771 if (ret < 0)
5772 printk(KERN_WARNING
5773 "kvm_vm_ioctl_set_memory_region: "
5774 "failed to munmap memory\n");
5775 }
5776
5211 spin_lock(&kvm->mmu_lock); 5777 spin_lock(&kvm->mmu_lock);
5212 if (!kvm->arch.n_requested_mmu_pages) { 5778 if (!kvm->arch.n_requested_mmu_pages) {
5213 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 5779 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
@@ -5216,8 +5782,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
5216 5782
5217 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 5783 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
5218 spin_unlock(&kvm->mmu_lock); 5784 spin_unlock(&kvm->mmu_lock);
5219
5220 return 0;
5221} 5785}
5222 5786
5223void kvm_arch_flush_shadow(struct kvm *kvm) 5787void kvm_arch_flush_shadow(struct kvm *kvm)
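The kvm_hv_hypercall() path added above packs the Hyper-V call code, the fast-call flag and the rep count/index into a single 64-bit input value, and hands the status back in the same packed form (split across EDX:EAX when the guest is not in long mode). A minimal, self-contained sketch of that bit layout only — the struct and function names below are invented for illustration and are not part of this patch:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring the field extraction in kvm_hv_hypercall(). */
struct hv_input_sketch {
	uint16_t code;     /* bits  0-15: hypercall code           */
	int      fast;     /* bit     16: register-based fast call */
	uint16_t rep_cnt;  /* bits 32-43: total rep count          */
	uint16_t rep_idx;  /* bits 48-59: rep start index          */
};

static struct hv_input_sketch decode_hv_input(uint64_t param)
{
	struct hv_input_sketch in = {
		.code    = param & 0xffff,
		.fast    = (param >> 16) & 0x1,
		.rep_cnt = (param >> 32) & 0xfff,
		.rep_idx = (param >> 48) & 0xfff,
	};
	return in;
}

int main(void)
{
	/* prints: code=2 fast=1 rep_cnt=3 rep_idx=1 */
	struct hv_input_sketch in = decode_hv_input(0x0001000300010002ULL);
	printf("code=%u fast=%d rep_cnt=%u rep_idx=%u\n",
	       in.code, in.fast, in.rep_cnt, in.rep_idx);
	return 0;
}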
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea585d2a..2d101639bd8d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,6 +2,7 @@
2#define ARCH_X86_KVM_X86_H 2#define ARCH_X86_KVM_X86_H
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5#include "kvm_cache_regs.h"
5 6
6static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) 7static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
7{ 8{
@@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
35struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 36struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
36 u32 function, u32 index); 37 u32 function, u32 index);
37 38
39static inline bool is_protmode(struct kvm_vcpu *vcpu)
40{
41 return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
42}
43
44static inline int is_long_mode(struct kvm_vcpu *vcpu)
45{
46#ifdef CONFIG_X86_64
47 return vcpu->arch.efer & EFER_LMA;
48#else
49 return 0;
50#endif
51}
52
53static inline int is_pae(struct kvm_vcpu *vcpu)
54{
55 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
56}
57
58static inline int is_pse(struct kvm_vcpu *vcpu)
59{
60 return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
61}
62
63static inline int is_paging(struct kvm_vcpu *vcpu)
64{
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66}
67
38#endif 68#endif
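The x86.h additions above are thin predicates over single control-register bits, read through KVM's register cache. A rough standalone illustration of what each one tests (bit positions per the Intel SDM; names are deliberately different, since the real helpers go through kvm_read_cr0_bits()/kvm_read_cr4_bits() rather than raw values):

#include <stdbool.h>
#include <stdint.h>

#define CR0_PE_BIT   (1u << 0)    /* protected mode enable  -> is_protmode()  */
#define CR0_PG_BIT   (1u << 31)   /* paging enable          -> is_paging()    */
#define CR4_PSE_BIT  (1u << 4)    /* page size extensions   -> is_pse()       */
#define CR4_PAE_BIT  (1u << 5)    /* physical address ext.  -> is_pae()       */
#define EFER_LMA_BIT (1u << 10)   /* long mode active       -> is_long_mode() */

static bool cr0_protmode(uint64_t cr0)    { return cr0 & CR0_PE_BIT; }
static bool cr0_paging(uint64_t cr0)      { return cr0 & CR0_PG_BIT; }
static bool cr4_pse(uint64_t cr4)         { return cr4 & CR4_PSE_BIT; }
static bool cr4_pae(uint64_t cr4)         { return cr4 & CR4_PAE_BIT; }
static bool efer_long_mode(uint64_t efer) { return efer & EFER_LMA_BIT; }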
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index cffd754f3039..419386c24b82 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -14,7 +14,7 @@ $(obj)/inat.o: $(obj)/inat-tables.c
14 14
15clean-files := inat-tables.c 15clean-files := inat-tables.c
16 16
17obj-$(CONFIG_SMP) += msr-smp.o 17obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
18 18
19lib-y := delay.o 19lib-y := delay.o
20lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
@@ -34,9 +34,10 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y)
34endif 34endif
35 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 35 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
36else 36else
37 obj-y += io_64.o iomap_copy_64.o 37 obj-y += iomap_copy_64.o
38 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o 38 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
39 lib-y += thunk_64.o clear_page_64.o copy_page_64.o 39 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
40 lib-y += memmove_64.o memset_64.o 40 lib-y += memmove_64.o memset_64.o
41 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o 41 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
42 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
42endif 43endif
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
new file mode 100644
index 000000000000..a3c668875038
--- /dev/null
+++ b/arch/x86/lib/cache-smp.c
@@ -0,0 +1,19 @@
1#include <linux/smp.h>
2#include <linux/module.h>
3
4static void __wbinvd(void *dummy)
5{
6 wbinvd();
7}
8
9void wbinvd_on_cpu(int cpu)
10{
11 smp_call_function_single(cpu, __wbinvd, NULL, 1);
12}
13EXPORT_SYMBOL(wbinvd_on_cpu);
14
15int wbinvd_on_all_cpus(void)
16{
17 return on_each_cpu(__wbinvd, NULL, 1);
18}
19EXPORT_SYMBOL(wbinvd_on_all_cpus);
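A hypothetical caller of the two cache-smp.c helpers above, assuming the declarations land in asm/smp.h as elsewhere in this series; the surrounding function and its error handling are invented for illustration only:

#include <linux/errno.h>
#include <asm/smp.h>	/* wbinvd_on_cpu(), wbinvd_on_all_cpus() */

/* Invented example: flush CPU caches before handing a buffer to a
 * device that is not cache-coherent. */
static int example_flush_for_device(int last_cpu)
{
	/* write back and invalidate caches on every online CPU ... */
	if (wbinvd_on_all_cpus())
		return -EIO;

	/* ... or only on the CPU known to have last touched the buffer */
	wbinvd_on_cpu(last_cpu);
	return 0;
}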
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index cf889d4e076a..71100c98e337 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -90,12 +90,6 @@ ENTRY(_copy_from_user)
90 CFI_ENDPROC 90 CFI_ENDPROC
91ENDPROC(_copy_from_user) 91ENDPROC(_copy_from_user)
92 92
93ENTRY(copy_user_generic)
94 CFI_STARTPROC
95 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
96 CFI_ENDPROC
97ENDPROC(copy_user_generic)
98
99 .section .fixup,"ax" 93 .section .fixup,"ax"
100 /* must zero dest */ 94 /* must zero dest */
101ENTRY(bad_from_user) 95ENTRY(bad_from_user)
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c
deleted file mode 100644
index 3f1eb59b5f08..000000000000
--- a/arch/x86/lib/io_64.c
+++ /dev/null
@@ -1,25 +0,0 @@
1#include <linux/string.h>
2#include <linux/module.h>
3#include <asm/io.h>
4
5void __memcpy_toio(unsigned long dst, const void *src, unsigned len)
6{
7 __inline_memcpy((void *)dst, src, len);
8}
9EXPORT_SYMBOL(__memcpy_toio);
10
11void __memcpy_fromio(void *dst, unsigned long src, unsigned len)
12{
13 __inline_memcpy(dst, (const void *)src, len);
14}
15EXPORT_SYMBOL(__memcpy_fromio);
16
17void memset_io(volatile void __iomem *a, int b, size_t c)
18{
19 /*
20 * TODO: memset can mangle the IO patterns quite a bit.
21 * perhaps it would be better to use a dumb one:
22 */
23 memset((void *)a, b, c);
24}
25EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index ad5441ed1b57..f82e884928af 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -20,12 +20,11 @@
20/* 20/*
21 * memcpy_c() - fast string ops (REP MOVSQ) based variant. 21 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
22 * 22 *
23 * Calls to this get patched into the kernel image via the 23 * This gets patched over the unrolled variant (below) via the
24 * alternative instructions framework: 24 * alternative instructions framework:
25 */ 25 */
26 ALIGN 26 .section .altinstr_replacement, "ax", @progbits
27memcpy_c: 27.Lmemcpy_c:
28 CFI_STARTPROC
29 movq %rdi, %rax 28 movq %rdi, %rax
30 29
31 movl %edx, %ecx 30 movl %edx, %ecx
@@ -35,8 +34,8 @@ memcpy_c:
35 movl %edx, %ecx 34 movl %edx, %ecx
36 rep movsb 35 rep movsb
37 ret 36 ret
38 CFI_ENDPROC 37.Lmemcpy_e:
39ENDPROC(memcpy_c) 38 .previous
40 39
41ENTRY(__memcpy) 40ENTRY(__memcpy)
42ENTRY(memcpy) 41ENTRY(memcpy)
@@ -128,16 +127,10 @@ ENDPROC(__memcpy)
128 * It is also a lot simpler. Use this when possible: 127 * It is also a lot simpler. Use this when possible:
129 */ 128 */
130 129
131 .section .altinstr_replacement, "ax"
1321: .byte 0xeb /* jmp <disp8> */
133 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
1342:
135 .previous
136
137 .section .altinstructions, "a" 130 .section .altinstructions, "a"
138 .align 8 131 .align 8
139 .quad memcpy 132 .quad memcpy
140 .quad 1b 133 .quad .Lmemcpy_c
141 .byte X86_FEATURE_REP_GOOD 134 .byte X86_FEATURE_REP_GOOD
142 135
143 /* 136 /*
@@ -145,6 +138,6 @@ ENDPROC(__memcpy)
145 * so it is silly to overwrite itself with nops - reboot is the 138 * so it is silly to overwrite itself with nops - reboot is the
146 * only outcome... 139 * only outcome...
147 */ 140 */
148 .byte 2b - 1b 141 .byte .Lmemcpy_e - .Lmemcpy_c
149 .byte 2b - 1b 142 .byte .Lmemcpy_e - .Lmemcpy_c
150 .previous 143 .previous
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 2c5948116bd2..e88d3b81644a 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -12,9 +12,8 @@
12 * 12 *
13 * rax original destination 13 * rax original destination
14 */ 14 */
15 ALIGN 15 .section .altinstr_replacement, "ax", @progbits
16memset_c: 16.Lmemset_c:
17 CFI_STARTPROC
18 movq %rdi,%r9 17 movq %rdi,%r9
19 movl %edx,%r8d 18 movl %edx,%r8d
20 andl $7,%r8d 19 andl $7,%r8d
@@ -29,8 +28,8 @@ memset_c:
29 rep stosb 28 rep stosb
30 movq %r9,%rax 29 movq %r9,%rax
31 ret 30 ret
32 CFI_ENDPROC 31.Lmemset_e:
33ENDPROC(memset_c) 32 .previous
34 33
35ENTRY(memset) 34ENTRY(memset)
36ENTRY(__memset) 35ENTRY(__memset)
@@ -118,16 +117,11 @@ ENDPROC(__memset)
118 117
119#include <asm/cpufeature.h> 118#include <asm/cpufeature.h>
120 119
121 .section .altinstr_replacement,"ax"
1221: .byte 0xeb /* jmp <disp8> */
123 .byte (memset_c - memset) - (2f - 1b) /* offset */
1242:
125 .previous
126 .section .altinstructions,"a" 120 .section .altinstructions,"a"
127 .align 8 121 .align 8
128 .quad memset 122 .quad memset
129 .quad 1b 123 .quad .Lmemset_c
130 .byte X86_FEATURE_REP_GOOD 124 .byte X86_FEATURE_REP_GOOD
131 .byte .Lfinal - memset 125 .byte .Lfinal - memset
132 .byte 2b - 1b 126 .byte .Lmemset_e - .Lmemset_c
133 .previous 127 .previous
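For readers following the memcpy_64.S/memset_64.S changes above: each hand-written .altinstructions record (.quad original, .quad replacement, feature byte, two length bytes) is one alternatives-table entry. Roughly, in C — an approximation from memory, with padding omitted; arch/x86/include/asm/alternative.h in this tree is the authoritative layout:

struct alt_instr_sketch {
	unsigned char *instr;          /* .quad memcpy / memset: original code        */
	unsigned char *replacement;    /* .quad .Lmemcpy_c / .Lmemset_c: REP variant  */
	unsigned char  cpuid;          /* .byte X86_FEATURE_REP_GOOD                  */
	unsigned char  instrlen;       /* length of the code being patched            */
	unsigned char  replacementlen; /* .byte .Lmemcpy_e - .Lmemcpy_c, etc.         */
};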
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
new file mode 100644
index 000000000000..15acecf0d7aa
--- /dev/null
+++ b/arch/x86/lib/rwsem_64.S
@@ -0,0 +1,81 @@
1/*
2 * x86-64 rwsem wrappers
3 *
4 * This interfaces the inline asm code to the slow-path
5 * C routines. We need to save the call-clobbered regs
6 * that the asm does not mark as clobbered, and move the
7 * argument from %rax to %rdi.
8 *
9 * NOTE! We don't need to save %rax, because the functions
10 * will always return the semaphore pointer in %rax (which
11 * is also the input argument to these helpers)
12 *
13 * The following can clobber %rdx because the asm clobbers it:
14 * call_rwsem_down_write_failed
15 * call_rwsem_wake
16 * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
17 */
18
19#include <linux/linkage.h>
20#include <asm/rwlock.h>
21#include <asm/alternative-asm.h>
22#include <asm/frame.h>
23#include <asm/dwarf2.h>
24
25#define save_common_regs \
26 pushq %rdi; \
27 pushq %rsi; \
28 pushq %rcx; \
29 pushq %r8; \
30 pushq %r9; \
31 pushq %r10; \
32 pushq %r11
33
34#define restore_common_regs \
35 popq %r11; \
36 popq %r10; \
37 popq %r9; \
38 popq %r8; \
39 popq %rcx; \
40 popq %rsi; \
41 popq %rdi
42
43/* Fix up special calling conventions */
44ENTRY(call_rwsem_down_read_failed)
45 save_common_regs
46 pushq %rdx
47 movq %rax,%rdi
48 call rwsem_down_read_failed
49 popq %rdx
50 restore_common_regs
51 ret
52 ENDPROC(call_rwsem_down_read_failed)
53
54ENTRY(call_rwsem_down_write_failed)
55 save_common_regs
56 movq %rax,%rdi
57 call rwsem_down_write_failed
58 restore_common_regs
59 ret
60 ENDPROC(call_rwsem_down_write_failed)
61
62ENTRY(call_rwsem_wake)
63 decw %dx /* do nothing if still outstanding active readers */
64 jnz 1f
65 save_common_regs
66 movq %rax,%rdi
67 call rwsem_wake
68 restore_common_regs
691: ret
70 ENDPROC(call_rwsem_wake)
71
72/* Fix up special calling conventions */
73ENTRY(call_rwsem_downgrade_wake)
74 save_common_regs
75 pushq %rdx
76 movq %rax,%rdi
77 call rwsem_downgrade_wake
78 popq %rdx
79 restore_common_regs
80 ret
81 ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d406c5239019..e71c5cbc8f35 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -266,16 +266,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
266 if (!after_bootmem) 266 if (!after_bootmem)
267 find_early_table_space(end, use_pse, use_gbpages); 267 find_early_table_space(end, use_pse, use_gbpages);
268 268
269#ifdef CONFIG_X86_32
270 for (i = 0; i < nr_range; i++)
271 kernel_physical_mapping_init(mr[i].start, mr[i].end,
272 mr[i].page_size_mask);
273 ret = end;
274#else /* CONFIG_X86_64 */
275 for (i = 0; i < nr_range; i++) 269 for (i = 0; i < nr_range; i++)
276 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, 270 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
277 mr[i].page_size_mask); 271 mr[i].page_size_mask);
278#endif
279 272
280#ifdef CONFIG_X86_32 273#ifdef CONFIG_X86_32
281 early_ioremap_page_table_range_init(); 274 early_ioremap_page_table_range_init();
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9a0c258a86be..5cb3f0f54f47 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -241,6 +241,7 @@ kernel_physical_mapping_init(unsigned long start,
241 unsigned long page_size_mask) 241 unsigned long page_size_mask)
242{ 242{
243 int use_pse = page_size_mask == (1<<PG_LEVEL_2M); 243 int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
244 unsigned long last_map_addr = end;
244 unsigned long start_pfn, end_pfn; 245 unsigned long start_pfn, end_pfn;
245 pgd_t *pgd_base = swapper_pg_dir; 246 pgd_t *pgd_base = swapper_pg_dir;
246 int pgd_idx, pmd_idx, pte_ofs; 247 int pgd_idx, pmd_idx, pte_ofs;
@@ -341,9 +342,10 @@ repeat:
341 prot = PAGE_KERNEL_EXEC; 342 prot = PAGE_KERNEL_EXEC;
342 343
343 pages_4k++; 344 pages_4k++;
344 if (mapping_iter == 1) 345 if (mapping_iter == 1) {
345 set_pte(pte, pfn_pte(pfn, init_prot)); 346 set_pte(pte, pfn_pte(pfn, init_prot));
346 else 347 last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
348 } else
347 set_pte(pte, pfn_pte(pfn, prot)); 349 set_pte(pte, pfn_pte(pfn, prot));
348 } 350 }
349 } 351 }
@@ -368,7 +370,7 @@ repeat:
368 mapping_iter = 2; 370 mapping_iter = 2;
369 goto repeat; 371 goto repeat;
370 } 372 }
371 return 0; 373 return last_map_addr;
372} 374}
373 375
374pte_t *kmap_pte; 376pte_t *kmap_pte;
@@ -748,6 +750,7 @@ static void __init zone_sizes_init(void)
748 free_area_init_nodes(max_zone_pfns); 750 free_area_init_nodes(max_zone_pfns);
749} 751}
750 752
753#ifndef CONFIG_NO_BOOTMEM
751static unsigned long __init setup_node_bootmem(int nodeid, 754static unsigned long __init setup_node_bootmem(int nodeid,
752 unsigned long start_pfn, 755 unsigned long start_pfn,
753 unsigned long end_pfn, 756 unsigned long end_pfn,
@@ -764,13 +767,14 @@ static unsigned long __init setup_node_bootmem(int nodeid,
764 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n", 767 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
765 nodeid, bootmap, bootmap + bootmap_size); 768 nodeid, bootmap, bootmap + bootmap_size);
766 free_bootmem_with_active_regions(nodeid, end_pfn); 769 free_bootmem_with_active_regions(nodeid, end_pfn);
767 early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
768 770
769 return bootmap + bootmap_size; 771 return bootmap + bootmap_size;
770} 772}
773#endif
771 774
772void __init setup_bootmem_allocator(void) 775void __init setup_bootmem_allocator(void)
773{ 776{
777#ifndef CONFIG_NO_BOOTMEM
774 int nodeid; 778 int nodeid;
775 unsigned long bootmap_size, bootmap; 779 unsigned long bootmap_size, bootmap;
776 /* 780 /*
@@ -782,11 +786,13 @@ void __init setup_bootmem_allocator(void)
782 if (bootmap == -1L) 786 if (bootmap == -1L)
783 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 787 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
784 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); 788 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
789#endif
785 790
786 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 791 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
787 max_pfn_mapped<<PAGE_SHIFT); 792 max_pfn_mapped<<PAGE_SHIFT);
788 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); 793 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
789 794
795#ifndef CONFIG_NO_BOOTMEM
790 for_each_online_node(nodeid) { 796 for_each_online_node(nodeid) {
791 unsigned long start_pfn, end_pfn; 797 unsigned long start_pfn, end_pfn;
792 798
@@ -804,6 +810,7 @@ void __init setup_bootmem_allocator(void)
804 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, 810 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
805 bootmap); 811 bootmap);
806 } 812 }
813#endif
807 814
808 after_bootmem = 1; 815 after_bootmem = 1;
809} 816}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 69ddfbd91135..e9b040e1cde5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -572,6 +572,7 @@ kernel_physical_mapping_init(unsigned long start,
572void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 572void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
573 int acpi, int k8) 573 int acpi, int k8)
574{ 574{
575#ifndef CONFIG_NO_BOOTMEM
575 unsigned long bootmap_size, bootmap; 576 unsigned long bootmap_size, bootmap;
576 577
577 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; 578 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
@@ -579,13 +580,15 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
579 PAGE_SIZE); 580 PAGE_SIZE);
580 if (bootmap == -1L) 581 if (bootmap == -1L)
581 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 582 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
583 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
582 /* don't touch min_low_pfn */ 584 /* don't touch min_low_pfn */
583 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, 585 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
584 0, end_pfn); 586 0, end_pfn);
585 e820_register_active_regions(0, start_pfn, end_pfn); 587 e820_register_active_regions(0, start_pfn, end_pfn);
586 free_bootmem_with_active_regions(0, end_pfn); 588 free_bootmem_with_active_regions(0, end_pfn);
587 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); 589#else
588 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); 590 e820_register_active_regions(0, start_pfn, end_pfn);
591#endif
589} 592}
590#endif 593#endif
591 594
@@ -974,7 +977,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
974 if (pmd_none(*pmd)) { 977 if (pmd_none(*pmd)) {
975 pte_t entry; 978 pte_t entry;
976 979
977 p = vmemmap_alloc_block(PMD_SIZE, node); 980 p = vmemmap_alloc_block_buf(PMD_SIZE, node);
978 if (!p) 981 if (!p)
979 return -ENOMEM; 982 return -ENOMEM;
980 983
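
The CONFIG_NO_BOOTMEM branches above drop the bootmem bookkeeping and lean on the early reservation table instead (reserve_early()/free_early()). As a rough illustration of that idea only, here is a minimal userspace sketch of a by-name reservation list; the structure, names and sizes are illustrative, not the kernel's implementation.

/*
 * Toy early-reservation table: ranges are recorded by name before any
 * real allocator exists. Purely illustrative; no overlap or bounds
 * checking is done.
 */
#include <stdio.h>
#include <string.h>

struct early_res { unsigned long start, end; char name[16]; };

static struct early_res early_res[32];
static int nr_early_res;

static void reserve_early(unsigned long start, unsigned long end,
			  const char *name)
{
	struct early_res *r = &early_res[nr_early_res++];

	r->start = start;
	r->end = end;
	strncpy(r->name, name, sizeof(r->name) - 1);
}

int main(void)
{
	int i;

	reserve_early(0x1000, 0x2000, "BOOTMAP");
	for (i = 0; i < nr_early_res; i++)
		printf("%s: %#lx-%#lx\n", early_res[i].name,
		       early_res[i].start, early_res[i].end);
	return 0;
}
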
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c246d259822d..5eb1ba74a3a9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,43 +24,6 @@
24 24
25#include "physaddr.h" 25#include "physaddr.h"
26 26
27int page_is_ram(unsigned long pagenr)
28{
29 resource_size_t addr, end;
30 int i;
31
32 /*
33 * A special case is the first 4Kb of memory;
34 * This is a BIOS owned area, not kernel ram, but generally
35 * not listed as such in the E820 table.
36 */
37 if (pagenr == 0)
38 return 0;
39
40 /*
41 * Second special case: Some BIOSen report the PC BIOS
42 * area (640->1Mb) as ram even though it is not.
43 */
44 if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
45 pagenr < (BIOS_END >> PAGE_SHIFT))
46 return 0;
47
48 for (i = 0; i < e820.nr_map; i++) {
49 /*
50 * Not usable memory:
51 */
52 if (e820.map[i].type != E820_RAM)
53 continue;
54 addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
55 end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
56
57
58 if ((pagenr >= addr) && (pagenr < end))
59 return 1;
60 }
61 return 0;
62}
63
64/* 27/*
65 * Fix up the linear direct mapping of the kernel to avoid cache attribute 28 * Fix up the linear direct mapping of the kernel to avoid cache attribute
66 * conflicts. 29 * conflicts.
@@ -422,6 +385,10 @@ void __init early_ioremap_init(void)
422 * The boot-ioremap range spans multiple pmds, for which 385 * The boot-ioremap range spans multiple pmds, for which
423 * we are not prepared: 386 * we are not prepared:
424 */ 387 */
388#define __FIXADDR_TOP (-PAGE_SIZE)
389 BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
390 != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
391#undef __FIXADDR_TOP
425 if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { 392 if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
426 WARN_ON(1); 393 WARN_ON(1);
427 printk(KERN_WARNING "pmd %p != %p\n", 394 printk(KERN_WARNING "pmd %p != %p\n",
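
The hunk above pins __FIXADDR_TOP to a constant around the check because BUILD_BUG_ON() needs a compile-time constant expression. A minimal standalone sketch of that style of assertion, using made-up fixmap addresses:

/*
 * Sketch of a BUILD_BUG_ON-style compile-time assertion: the condition
 * must fold to a constant, which is why the diff temporarily defines
 * __FIXADDR_TOP before the check. Addresses below are illustrative.
 */
#define MY_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

#define PMD_SHIFT 21
#define FIX_A 0xffffe000UL
#define FIX_B 0xffffd000UL

int main(void)
{
	/* Fails to compile if the two addresses land in different pmds. */
	MY_BUILD_BUG_ON((FIX_A >> PMD_SHIFT) != (FIX_B >> PMD_SHIFT));
	return 0;
}
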
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index 8cc183344140..b3b531a4f8e5 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -337,7 +337,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
337 if (!shadow) 337 if (!shadow)
338 return true; 338 return true;
339 339
340 status = kmemcheck_shadow_test(shadow, size); 340 status = kmemcheck_shadow_test_all(shadow, size);
341 341
342 return status == KMEMCHECK_SHADOW_INITIALIZED; 342 return status == KMEMCHECK_SHADOW_INITIALIZED;
343} 343}
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
index 3f66b82076a3..aec124214d97 100644
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -125,12 +125,12 @@ void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
125 125
126enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) 126enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
127{ 127{
128#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
128 uint8_t *x; 129 uint8_t *x;
129 unsigned int i; 130 unsigned int i;
130 131
131 x = shadow; 132 x = shadow;
132 133
133#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
134 /* 134 /*
135 * Make sure _some_ bytes are initialized. Gcc frequently generates 135 * Make sure _some_ bytes are initialized. Gcc frequently generates
136 * code to access neighboring bytes. 136 * code to access neighboring bytes.
@@ -139,13 +139,25 @@ enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
139 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) 139 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
140 return x[i]; 140 return x[i];
141 } 141 }
142
143 return x[0];
142#else 144#else
145 return kmemcheck_shadow_test_all(shadow, size);
146#endif
147}
148
149enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
150{
151 uint8_t *x;
152 unsigned int i;
153
154 x = shadow;
155
143 /* All bytes must be initialized. */ 156 /* All bytes must be initialized. */
144 for (i = 0; i < size; ++i) { 157 for (i = 0; i < size; ++i) {
145 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) 158 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
146 return x[i]; 159 return x[i];
147 } 160 }
148#endif
149 161
150 return x[0]; 162 return x[0];
151} 163}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index af46d9ab9d86..ff0b2f70fbcb 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -11,6 +11,8 @@ enum kmemcheck_shadow {
11void *kmemcheck_shadow_lookup(unsigned long address); 11void *kmemcheck_shadow_lookup(unsigned long address);
12 12
13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); 13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
14enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
15 unsigned int size);
14void kmemcheck_shadow_set(void *shadow, unsigned int size); 16void kmemcheck_shadow_set(void *shadow, unsigned int size);
15 17
16#endif 18#endif
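
The kmemcheck hunks split the shadow check into a "some bytes initialized" test (the CONFIG_KMEMCHECK_PARTIAL_OK case) and an "all bytes initialized" test now used by kmemcheck_is_obj_initialized(). A userspace sketch of the two predicates, with SHADOW_INITIALIZED standing in for the kernel's shadow value:

/*
 * "Any byte initialized" vs. "every byte initialized" checks over a
 * shadow byte array; mirrors the split above in simplified form.
 */
#include <stdint.h>
#include <stdio.h>

#define SHADOW_INITIALIZED 0x01

static int shadow_test_any(const uint8_t *x, unsigned int size)
{
	unsigned int i;

	for (i = 0; i < size; i++)
		if (x[i] == SHADOW_INITIALIZED)
			return 1;	/* at least one byte is initialized */
	return 0;
}

static int shadow_test_all(const uint8_t *x, unsigned int size)
{
	unsigned int i;

	for (i = 0; i < size; i++)
		if (x[i] != SHADOW_INITIALIZED)
			return 0;	/* found an uninitialized byte */
	return 1;
}

int main(void)
{
	uint8_t shadow[4] = { SHADOW_INITIALIZED, 0, 0, SHADOW_INITIALIZED };

	printf("any=%d all=%d\n", shadow_test_any(shadow, 4),
	       shadow_test_all(shadow, 4));
	return 0;
}
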
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c8191defc38a..1dab5194fd9d 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -71,7 +71,7 @@ static int mmap_is_legacy(void)
71 if (current->personality & ADDR_COMPAT_LAYOUT) 71 if (current->personality & ADDR_COMPAT_LAYOUT)
72 return 1; 72 return 1;
73 73
74 if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) 74 if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
75 return 1; 75 return 1;
76 76
77 return sysctl_legacy_va_layout; 77 return sysctl_legacy_va_layout;
@@ -96,7 +96,7 @@ static unsigned long mmap_rnd(void)
96 96
97static unsigned long mmap_base(void) 97static unsigned long mmap_base(void)
98{ 98{
99 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; 99 unsigned long gap = rlimit(RLIMIT_STACK);
100 100
101 if (gap < MIN_GAP) 101 if (gap < MIN_GAP)
102 gap = MIN_GAP; 102 gap = MIN_GAP;
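
The mmap.c hunks replace open-coded current->signal->rlim[RLIMIT_STACK].rlim_cur reads with the rlimit() accessor. For intuition, the value in question is simply the task's current (soft) stack limit, which this small userspace program reports:

/*
 * Print the soft RLIMIT_STACK value; an "unlimited" limit is what makes
 * mmap_is_legacy() above fall back to the legacy layout.
 */
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_STACK, &rl))
		return 1;

	if (rl.rlim_cur == RLIM_INFINITY)
		printf("stack limit: unlimited (legacy mmap layout)\n");
	else
		printf("stack limit: %llu bytes\n",
		       (unsigned long long)rl.rlim_cur);
	return 0;
}
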
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index b20760ca7244..809baaaf48b1 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -418,7 +418,10 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
418 418
419 for_each_online_node(nid) { 419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->node_id = nid;
422#ifndef CONFIG_NO_BOOTMEM
421 NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; 423 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
424#endif
422 } 425 }
423 426
424 setup_bootmem_allocator(); 427 setup_bootmem_allocator();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 83bbc70d11bb..8948f47fde05 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -163,30 +163,48 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
163 unsigned long end, unsigned long size, 163 unsigned long end, unsigned long size,
164 unsigned long align) 164 unsigned long align)
165{ 165{
166 unsigned long mem = find_e820_area(start, end, size, align); 166 unsigned long mem;
167 void *ptr;
168 167
168 /*
169	 * put it as high as possible;
170	 * NODE_DATA and related early allocations will go there
171 */
172 if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
173 start = MAX_DMA_PFN<<PAGE_SHIFT;
174 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
175 end > (MAX_DMA32_PFN<<PAGE_SHIFT))
176 start = MAX_DMA32_PFN<<PAGE_SHIFT;
177 mem = find_e820_area(start, end, size, align);
178 if (mem != -1L)
179 return __va(mem);
180
181 /* extend the search scope */
182 end = max_pfn_mapped << PAGE_SHIFT;
183 if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
184 start = MAX_DMA32_PFN<<PAGE_SHIFT;
185 else
186 start = MAX_DMA_PFN<<PAGE_SHIFT;
187 mem = find_e820_area(start, end, size, align);
169 if (mem != -1L) 188 if (mem != -1L)
170 return __va(mem); 189 return __va(mem);
171 190
172 ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); 191 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
173 if (ptr == NULL) {
174 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
175 size, nodeid); 192 size, nodeid);
176 return NULL; 193
177 } 194 return NULL;
178 return ptr;
179} 195}
180 196
181/* Initialize bootmem allocator for a node */ 197/* Initialize bootmem allocator for a node */
182void __init 198void __init
183setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 199setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
184{ 200{
185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 201 unsigned long start_pfn, last_pfn, nodedata_phys;
186 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 202 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
187 unsigned long bootmap_start, nodedata_phys;
188 void *bootmap;
189 int nid; 203 int nid;
204#ifndef CONFIG_NO_BOOTMEM
205 unsigned long bootmap_start, bootmap_pages, bootmap_size;
206 void *bootmap;
207#endif
190 208
191 if (!end) 209 if (!end)
192 return; 210 return;
@@ -200,7 +218,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
200 218
201 start = roundup(start, ZONE_ALIGN); 219 start = roundup(start, ZONE_ALIGN);
202 220
203 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 221 printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
204 start, end); 222 start, end);
205 223
206 start_pfn = start >> PAGE_SHIFT; 224 start_pfn = start >> PAGE_SHIFT;
@@ -211,14 +229,21 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
211 if (node_data[nodeid] == NULL) 229 if (node_data[nodeid] == NULL)
212 return; 230 return;
213 nodedata_phys = __pa(node_data[nodeid]); 231 nodedata_phys = __pa(node_data[nodeid]);
232 reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
214 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, 233 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
215 nodedata_phys + pgdat_size - 1); 234 nodedata_phys + pgdat_size - 1);
235 nid = phys_to_nid(nodedata_phys);
236 if (nid != nodeid)
237 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
216 238
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 239 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; 240 NODE_DATA(nodeid)->node_id = nodeid;
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 241 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; 242 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 243
244#ifndef CONFIG_NO_BOOTMEM
245 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
246
222 /* 247 /*
223 * Find a place for the bootmem map 248 * Find a place for the bootmem map
224 * nodedata_phys could be on other nodes by alloc_bootmem, 249 * nodedata_phys could be on other nodes by alloc_bootmem,
@@ -227,11 +252,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
227 * of alloc_bootmem, that could clash with reserved range 252 * of alloc_bootmem, that could clash with reserved range
228 */ 253 */
229 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); 254 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 255 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
231 if (nid == nodeid)
232 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else
234 bootmap_start = roundup(start, PAGE_SIZE);
235 /* 256 /*
236 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like 257 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 258 * to use that to align to PAGE_SIZE
@@ -239,18 +260,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 260 bootmap = early_node_mem(nodeid, bootmap_start, end,
240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 261 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
241 if (bootmap == NULL) { 262 if (bootmap == NULL) {
242 if (nodedata_phys < start || nodedata_phys >= end) { 263 free_early(nodedata_phys, nodedata_phys + pgdat_size);
243 /*
244 * only need to free it if it is from other node
245 * bootmem
246 */
247 if (nid != nodeid)
248 free_bootmem(nodedata_phys, pgdat_size);
249 }
250 node_data[nodeid] = NULL; 264 node_data[nodeid] = NULL;
251 return; 265 return;
252 } 266 }
253 bootmap_start = __pa(bootmap); 267 bootmap_start = __pa(bootmap);
268 reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
269 "BOOTMAP");
254 270
255 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 271 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
256 bootmap_start >> PAGE_SHIFT, 272 bootmap_start >> PAGE_SHIFT,
@@ -259,31 +275,12 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
259 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 275 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
260 bootmap_start, bootmap_start + bootmap_size - 1, 276 bootmap_start, bootmap_start + bootmap_size - 1,
261 bootmap_pages); 277 bootmap_pages);
262
263 free_bootmem_with_active_regions(nodeid, end);
264
265 /*
266 * convert early reserve to bootmem reserve earlier
267 * otherwise early_node_mem could use early reserved mem
268 * on previous node
269 */
270 early_res_to_bootmem(start, end);
271
272 /*
273 * in some case early_node_mem could use alloc_bootmem
274 * to get range on other node, don't reserve that again
275 */
276 if (nid != nodeid)
277 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
278 else
279 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
280 pgdat_size, BOOTMEM_DEFAULT);
281 nid = phys_to_nid(bootmap_start); 278 nid = phys_to_nid(bootmap_start);
282 if (nid != nodeid) 279 if (nid != nodeid)
283 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); 280 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
284 else 281
285 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, 282 free_bootmem_with_active_regions(nodeid, end);
286 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); 283#endif
287 284
288 node_set_online(nodeid); 285 node_set_online(nodeid);
289} 286}
@@ -427,7 +424,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
427 * Calculate the number of big nodes that can be allocated as a result 424 * Calculate the number of big nodes that can be allocated as a result
428 * of consolidating the remainder. 425 * of consolidating the remainder.
429 */ 426 */
430 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / 427 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
431 FAKE_NODE_MIN_SIZE; 428 FAKE_NODE_MIN_SIZE;
432 429
433 size &= FAKE_NODE_MIN_HASH_MASK; 430 size &= FAKE_NODE_MIN_HASH_MASK;
@@ -502,77 +499,99 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
502} 499}
503 500
504/* 501/*
505 * Splits num_nodes nodes up equally starting at node_start. The return value 502 * Returns the end address of a node so that there is at least `size' amount of
506 * is the number of nodes split up and addr is adjusted to be at the end of the 503 * non-reserved memory or `max_addr' is reached.
507 * last node allocated.
508 */ 504 */
509static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, 505static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
510 int num_nodes)
511{ 506{
512 unsigned int big; 507 u64 end = start + size;
513 u64 size;
514 int i;
515 508
516 if (num_nodes <= 0) 509 while (end - start - e820_hole_size(start, end) < size) {
517 return -1; 510 end += FAKE_NODE_MIN_SIZE;
518 if (num_nodes > MAX_NUMNODES) 511 if (end > max_addr) {
519 num_nodes = MAX_NUMNODES;
520 size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
521 num_nodes;
522 /*
523 * Calculate the number of big nodes that can be allocated as a result
524 * of consolidating the leftovers.
525 */
526 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
527 FAKE_NODE_MIN_SIZE;
528
529 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
530 size &= FAKE_NODE_MIN_HASH_MASK;
531 if (!size) {
532 printk(KERN_ERR "Not enough memory for each node. "
533 "NUMA emulation disabled.\n");
534 return -1;
535 }
536
537 for (i = node_start; i < num_nodes + node_start; i++) {
538 u64 end = *addr + size;
539
540 if (i < big)
541 end += FAKE_NODE_MIN_SIZE;
542 /*
543 * The final node can have the remaining system RAM. Other
544 * nodes receive roughly the same amount of available pages.
545 */
546 if (i == num_nodes + node_start - 1)
547 end = max_addr; 512 end = max_addr;
548 else
549 while (end - *addr - e820_hole_size(*addr, end) <
550 size) {
551 end += FAKE_NODE_MIN_SIZE;
552 if (end > max_addr) {
553 end = max_addr;
554 break;
555 }
556 }
557 if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
558 break; 513 break;
514 }
559 } 515 }
560 return i - node_start + 1; 516 return end;
561} 517}
562 518
563/* 519/*
564 * Splits the remaining system RAM into chunks of size. The remaining memory is 520 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
565 * always assigned to a final node and can be asymmetric. Returns the number of 521 * `addr' to `max_addr'. The return value is the number of nodes allocated.
566 * nodes split.
567 */ 522 */
568static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, 523static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
569 u64 size)
570{ 524{
571 int i = node_start; 525 nodemask_t physnode_mask = NODE_MASK_NONE;
572 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; 526 u64 min_size;
573 while (!setup_node_range(i++, addr, size, max_addr)) 527 int ret = 0;
574 ; 528 int i;
575 return i - node_start; 529
530 if (!size)
531 return -1;
532 /*
533 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
534 * increased accordingly if the requested size is too small. This
535 * creates a uniform distribution of node sizes across the entire
536 * machine (but not necessarily over physical nodes).
537 */
538 min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
539 MAX_NUMNODES;
540 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
541 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
542 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
543 FAKE_NODE_MIN_HASH_MASK;
544 if (size < min_size) {
545 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
546 size >> 20, min_size >> 20);
547 size = min_size;
548 }
549 size &= FAKE_NODE_MIN_HASH_MASK;
550
551 for (i = 0; i < MAX_NUMNODES; i++)
552 if (physnodes[i].start != physnodes[i].end)
553 node_set(i, physnode_mask);
554 /*
555 * Fill physical nodes with fake nodes of size until there is no memory
556 * left on any of them.
557 */
558 while (nodes_weight(physnode_mask)) {
559 for_each_node_mask(i, physnode_mask) {
560 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
561 u64 end;
562
563 end = find_end_of_node(physnodes[i].start,
564 physnodes[i].end, size);
565 /*
566 * If there won't be at least FAKE_NODE_MIN_SIZE of
567 * non-reserved memory in ZONE_DMA32 for the next node,
568 * this one must extend to the boundary.
569 */
570 if (end < dma32_end && dma32_end - end -
571 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
572 end = dma32_end;
573
574 /*
575 * If there won't be enough non-reserved memory for the
576 * next node, this one must extend to the end of the
577 * physical node.
578 */
579 if (physnodes[i].end - end -
580 e820_hole_size(end, physnodes[i].end) < size)
581 end = physnodes[i].end;
582
583 /*
584 * Setup the fake node that will be allocated as bootmem
585 * later. If setup_node_range() returns non-zero, there
586 * is no more memory available on this physical node.
587 */
588 if (setup_node_range(ret++, &physnodes[i].start,
589 end - physnodes[i].start,
590 physnodes[i].end) < 0)
591 node_clear(i, physnode_mask);
592 }
593 }
594 return ret;
576} 595}
577 596
578/* 597/*
@@ -582,87 +601,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
582static int __init numa_emulation(unsigned long start_pfn, 601static int __init numa_emulation(unsigned long start_pfn,
583 unsigned long last_pfn, int acpi, int k8) 602 unsigned long last_pfn, int acpi, int k8)
584{ 603{
585 u64 size, addr = start_pfn << PAGE_SHIFT; 604 u64 addr = start_pfn << PAGE_SHIFT;
586 u64 max_addr = last_pfn << PAGE_SHIFT; 605 u64 max_addr = last_pfn << PAGE_SHIFT;
587 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
588 int num_phys_nodes; 606 int num_phys_nodes;
607 int num_nodes;
608 int i;
589 609
590 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); 610 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
591 /* 611 /*
592 * If the numa=fake command-line is just a single number N, split the 612 * If the numa=fake command-line contains a 'M' or 'G', it represents
593 * system RAM into N fake nodes. 613 * the fixed node size. Otherwise, if it is just a single number N,
614 * split the system RAM into N fake nodes.
594 */ 615 */
595 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 616 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
596 long n = simple_strtol(cmdline, NULL, 0); 617 u64 size;
597
598 num_nodes = split_nodes_interleave(addr, max_addr,
599 num_phys_nodes, n);
600 if (num_nodes < 0)
601 return num_nodes;
602 goto out;
603 }
604 618
605 /* Parse the command line. */ 619 size = memparse(cmdline, &cmdline);
606 for (coeff_flag = 0; ; cmdline++) { 620 num_nodes = split_nodes_size_interleave(addr, max_addr, size);
607 if (*cmdline && isdigit(*cmdline)) { 621 } else {
608 num = num * 10 + *cmdline - '0'; 622 unsigned long n;
609 continue; 623
610 } 624 n = simple_strtoul(cmdline, NULL, 0);
611 if (*cmdline == '*') { 625 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
612 if (num > 0)
613 coeff = num;
614 coeff_flag = 1;
615 }
616 if (!*cmdline || *cmdline == ',') {
617 if (!coeff_flag)
618 coeff = 1;
619 /*
620 * Round down to the nearest FAKE_NODE_MIN_SIZE.
621 * Command-line coefficients are in megabytes.
622 */
623 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
624 if (size)
625 for (i = 0; i < coeff; i++, num_nodes++)
626 if (setup_node_range(num_nodes, &addr,
627 size, max_addr) < 0)
628 goto done;
629 if (!*cmdline)
630 break;
631 coeff_flag = 0;
632 coeff = -1;
633 }
634 num = 0;
635 }
636done:
637 if (!num_nodes)
638 return -1;
639 /* Fill remainder of system RAM, if appropriate. */
640 if (addr < max_addr) {
641 if (coeff_flag && coeff < 0) {
642 /* Split remaining nodes into num-sized chunks */
643 num_nodes += split_nodes_by_size(&addr, max_addr,
644 num_nodes, num);
645 goto out;
646 }
647 switch (*(cmdline - 1)) {
648 case '*':
649 /* Split remaining nodes into coeff chunks */
650 if (coeff <= 0)
651 break;
652 num_nodes += split_nodes_equally(&addr, max_addr,
653 num_nodes, coeff);
654 break;
655 case ',':
656 /* Do not allocate remaining system RAM */
657 break;
658 default:
659 /* Give one final node */
660 setup_node_range(num_nodes, &addr, max_addr - addr,
661 max_addr);
662 num_nodes++;
663 }
664 } 626 }
665out: 627
628 if (num_nodes < 0)
629 return num_nodes;
666 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); 630 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
667 if (memnode_shift < 0) { 631 if (memnode_shift < 0) {
668 memnode_shift = 0; 632 memnode_shift = 0;
@@ -742,6 +706,10 @@ unsigned long __init numa_free_all_bootmem(void)
742 for_each_online_node(i) 706 for_each_online_node(i)
743 pages += free_all_bootmem_node(NODE_DATA(i)); 707 pages += free_all_bootmem_node(NODE_DATA(i));
744 708
709#ifdef CONFIG_NO_BOOTMEM
710 pages += free_all_memory_core_early(MAX_NUMNODES);
711#endif
712
745 return pages; 713 return pages;
746} 714}
747 715
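
find_end_of_node(), added above, grows a candidate node end in FAKE_NODE_MIN_SIZE steps until the range contains enough non-reserved memory or the limit is reached. A self-contained sketch of the same loop, with a stand-in hole_size() in place of e820_hole_size() and made-up sizes:

/*
 * Simplified find_end_of_node(): extend `end' until the usable
 * (non-hole) span covers `size' bytes or max_addr is hit.
 */
#include <stdio.h>

#define FAKE_NODE_MIN_SIZE (64UL << 20)

/* Pretend every non-empty range contains a fixed 16 MB hole. */
static unsigned long hole_size(unsigned long start, unsigned long end)
{
	return (end > start) ? (16UL << 20) : 0;
}

static unsigned long find_end_of_node(unsigned long start,
				      unsigned long max_addr,
				      unsigned long size)
{
	unsigned long end = start + size;

	while (end - start - hole_size(start, end) < size) {
		end += FAKE_NODE_MIN_SIZE;
		if (end > max_addr) {
			end = max_addr;
			break;
		}
	}
	return end;
}

int main(void)
{
	unsigned long end = find_end_of_node(0, 0xc0000000UL, 128UL << 20);

	printf("node end = %#lx\n", end);
	return 0;
}
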
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ed34f5e35999..c9ba9deafe83 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -6,6 +6,14 @@
6 6
7#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO 7#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
8 8
9#ifdef CONFIG_HIGHPTE
10#define PGALLOC_USER_GFP __GFP_HIGHMEM
11#else
12#define PGALLOC_USER_GFP 0
13#endif
14
15gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
16
9pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 17pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
10{ 18{
11 return (pte_t *)__get_free_page(PGALLOC_GFP); 19 return (pte_t *)__get_free_page(PGALLOC_GFP);
@@ -15,16 +23,29 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
15{ 23{
16 struct page *pte; 24 struct page *pte;
17 25
18#ifdef CONFIG_HIGHPTE 26 pte = alloc_pages(__userpte_alloc_gfp, 0);
19 pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
20#else
21 pte = alloc_pages(PGALLOC_GFP, 0);
22#endif
23 if (pte) 27 if (pte)
24 pgtable_page_ctor(pte); 28 pgtable_page_ctor(pte);
25 return pte; 29 return pte;
26} 30}
27 31
32static int __init setup_userpte(char *arg)
33{
34 if (!arg)
35 return -EINVAL;
36
37 /*
38 * "userpte=nohigh" disables allocation of user pagetables in
39 * high memory.
40 */
41 if (strcmp(arg, "nohigh") == 0)
42 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
43 else
44 return -EINVAL;
45 return 0;
46}
47early_param("userpte", setup_userpte);
48
28void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 49void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
29{ 50{
30 pgtable_page_dtor(pte); 51 pgtable_page_dtor(pte);
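
The pgtable.c change caches the user-pagetable GFP flags in __userpte_alloc_gfp and lets the "userpte=nohigh" boot option clear __GFP_HIGHMEM early in boot. A toy userspace sketch of that flag-mask toggle; the flag values and names below are made up, only the pattern matches:

/*
 * Default allocation mask includes a "highmem" flag; the option string
 * "nohigh" clears it, mirroring setup_userpte() above.
 */
#include <stdio.h>
#include <string.h>

#define ALLOC_BASE	0x01
#define ALLOC_HIGHMEM	0x02

static unsigned int userpte_gfp = ALLOC_BASE | ALLOC_HIGHMEM;

static int setup_userpte(const char *arg)
{
	if (!arg)
		return -1;
	if (strcmp(arg, "nohigh") == 0) {
		userpte_gfp &= ~ALLOC_HIGHMEM;	/* keep user PTE pages low */
		return 0;
	}
	return -1;
}

int main(void)
{
	setup_userpte("nohigh");
	printf("user pte gfp mask: %#x\n", userpte_gfp);
	return 0;
}
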
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 65b58e4b0b8b..426f3a1a64d3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -41,7 +41,7 @@ union smp_flush_state {
41 struct { 41 struct {
42 struct mm_struct *flush_mm; 42 struct mm_struct *flush_mm;
43 unsigned long flush_va; 43 unsigned long flush_va;
44 spinlock_t tlbstate_lock; 44 raw_spinlock_t tlbstate_lock;
45 DECLARE_BITMAP(flush_cpumask, NR_CPUS); 45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
46 }; 46 };
47 char pad[INTERNODE_CACHE_BYTES]; 47 char pad[INTERNODE_CACHE_BYTES];
@@ -181,7 +181,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
181 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is 181 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
182 * probably not worth checking this for a cache-hot lock. 182 * probably not worth checking this for a cache-hot lock.
183 */ 183 */
184 spin_lock(&f->tlbstate_lock); 184 raw_spin_lock(&f->tlbstate_lock);
185 185
186 f->flush_mm = mm; 186 f->flush_mm = mm;
187 f->flush_va = va; 187 f->flush_va = va;
@@ -199,7 +199,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
199 199
200 f->flush_mm = NULL; 200 f->flush_mm = NULL;
201 f->flush_va = 0; 201 f->flush_va = 0;
202 spin_unlock(&f->tlbstate_lock); 202 raw_spin_unlock(&f->tlbstate_lock);
203} 203}
204 204
205void native_flush_tlb_others(const struct cpumask *cpumask, 205void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -223,7 +223,7 @@ static int __cpuinit init_smp_flush(void)
223 int i; 223 int i;
224 224
225 for (i = 0; i < ARRAY_SIZE(flush_state); i++) 225 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
226 spin_lock_init(&flush_state[i].tlbstate_lock); 226 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
227 227
228 return 0; 228 return 0;
229} 229}
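
The tlb.c hunks convert the flush-state lock to raw_spinlock_t, presumably so it always behaves as a true spinning lock regardless of how spinlock_t itself is implemented. A kernel-style fragment (it only builds in-tree) showing the corresponding declaration and lock/unlock calls:

/* Fragment, not wired to anything: declare and use a raw spinlock. */
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(tlbstate_lock);

static void touch_flush_state(void)
{
	raw_spin_lock(&tlbstate_lock);
	/* update the shared flush state here */
	raw_spin_unlock(&tlbstate_lock);
}
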
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 3347f696edc7..2c505ee71014 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -159,7 +159,7 @@ static int nmi_setup_mux(void)
159 159
160 for_each_possible_cpu(i) { 160 for_each_possible_cpu(i) {
161 per_cpu(cpu_msrs, i).multiplex = 161 per_cpu(cpu_msrs, i).multiplex =
162 kmalloc(multiplex_size, GFP_KERNEL); 162 kzalloc(multiplex_size, GFP_KERNEL);
163 if (!per_cpu(cpu_msrs, i).multiplex) 163 if (!per_cpu(cpu_msrs, i).multiplex)
164 return 0; 164 return 0;
165 } 165 }
@@ -179,7 +179,6 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
179 if (counter_config[i].enabled) { 179 if (counter_config[i].enabled) {
180 multiplex[i].saved = -(u64)counter_config[i].count; 180 multiplex[i].saved = -(u64)counter_config[i].count;
181 } else { 181 } else {
182 multiplex[i].addr = 0;
183 multiplex[i].saved = 0; 182 multiplex[i].saved = 0;
184 } 183 }
185 } 184 }
@@ -189,25 +188,27 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
189 188
190static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) 189static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
191{ 190{
191 struct op_msr *counters = msrs->counters;
192 struct op_msr *multiplex = msrs->multiplex; 192 struct op_msr *multiplex = msrs->multiplex;
193 int i; 193 int i;
194 194
195 for (i = 0; i < model->num_counters; ++i) { 195 for (i = 0; i < model->num_counters; ++i) {
196 int virt = op_x86_phys_to_virt(i); 196 int virt = op_x86_phys_to_virt(i);
197 if (multiplex[virt].addr) 197 if (counters[i].addr)
198 rdmsrl(multiplex[virt].addr, multiplex[virt].saved); 198 rdmsrl(counters[i].addr, multiplex[virt].saved);
199 } 199 }
200} 200}
201 201
202static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) 202static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
203{ 203{
204 struct op_msr *counters = msrs->counters;
204 struct op_msr *multiplex = msrs->multiplex; 205 struct op_msr *multiplex = msrs->multiplex;
205 int i; 206 int i;
206 207
207 for (i = 0; i < model->num_counters; ++i) { 208 for (i = 0; i < model->num_counters; ++i) {
208 int virt = op_x86_phys_to_virt(i); 209 int virt = op_x86_phys_to_virt(i);
209 if (multiplex[virt].addr) 210 if (counters[i].addr)
210 wrmsrl(multiplex[virt].addr, multiplex[virt].saved); 211 wrmsrl(counters[i].addr, multiplex[virt].saved);
211 } 212 }
212} 213}
213 214
@@ -303,11 +304,11 @@ static int allocate_msrs(void)
303 304
304 int i; 305 int i;
305 for_each_possible_cpu(i) { 306 for_each_possible_cpu(i) {
306 per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, 307 per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
307 GFP_KERNEL); 308 GFP_KERNEL);
308 if (!per_cpu(cpu_msrs, i).counters) 309 if (!per_cpu(cpu_msrs, i).counters)
309 return 0; 310 return 0;
310 per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, 311 per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
311 GFP_KERNEL); 312 GFP_KERNEL);
312 if (!per_cpu(cpu_msrs, i).controls) 313 if (!per_cpu(cpu_msrs, i).controls)
313 return 0; 314 return 0;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 39686c29f03a..6a58256dce9f 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -22,6 +22,9 @@
22#include <asm/ptrace.h> 22#include <asm/ptrace.h>
23#include <asm/msr.h> 23#include <asm/msr.h>
24#include <asm/nmi.h> 24#include <asm/nmi.h>
25#include <asm/apic.h>
26#include <asm/processor.h>
27#include <asm/cpufeature.h>
25 28
26#include "op_x86_model.h" 29#include "op_x86_model.h"
27#include "op_counter.h" 30#include "op_counter.h"
@@ -43,15 +46,13 @@
43 46
44static unsigned long reset_value[NUM_VIRT_COUNTERS]; 47static unsigned long reset_value[NUM_VIRT_COUNTERS];
45 48
46#ifdef CONFIG_OPROFILE_IBS
47
48/* IbsFetchCtl bits/masks */ 49/* IbsFetchCtl bits/masks */
49#define IBS_FETCH_RAND_EN (1ULL<<57) 50#define IBS_FETCH_RAND_EN (1ULL<<57)
50#define IBS_FETCH_VAL (1ULL<<49) 51#define IBS_FETCH_VAL (1ULL<<49)
51#define IBS_FETCH_ENABLE (1ULL<<48) 52#define IBS_FETCH_ENABLE (1ULL<<48)
52#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL 53#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL
53 54
54/*IbsOpCtl bits */ 55/* IbsOpCtl bits */
55#define IBS_OP_CNT_CTL (1ULL<<19) 56#define IBS_OP_CNT_CTL (1ULL<<19)
56#define IBS_OP_VAL (1ULL<<18) 57#define IBS_OP_VAL (1ULL<<18)
57#define IBS_OP_ENABLE (1ULL<<17) 58#define IBS_OP_ENABLE (1ULL<<17)
@@ -59,7 +60,7 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS];
59#define IBS_FETCH_SIZE 6 60#define IBS_FETCH_SIZE 6
60#define IBS_OP_SIZE 12 61#define IBS_OP_SIZE 12
61 62
62static int has_ibs; /* AMD Family10h and later */ 63static u32 ibs_caps;
63 64
64struct op_ibs_config { 65struct op_ibs_config {
65 unsigned long op_enabled; 66 unsigned long op_enabled;
@@ -71,24 +72,52 @@ struct op_ibs_config {
71}; 72};
72 73
73static struct op_ibs_config ibs_config; 74static struct op_ibs_config ibs_config;
75static u64 ibs_op_ctl;
74 76
75#endif 77/*
78 * IBS cpuid feature detection
79 */
76 80
77#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 81#define IBS_CPUID_FEATURES 0x8000001b
82
83/*
84 * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
85 * bit 0 is used to indicate the existence of IBS.
86 */
87#define IBS_CAPS_AVAIL (1LL<<0)
88#define IBS_CAPS_RDWROPCNT (1LL<<3)
89#define IBS_CAPS_OPCNT (1LL<<4)
90
91/*
92 * IBS randomization macros
93 */
94#define IBS_RANDOM_BITS 12
95#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1)
96#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5))
78 97
79static void op_mux_fill_in_addresses(struct op_msrs * const msrs) 98static u32 get_ibs_caps(void)
80{ 99{
81 int i; 100 u32 ibs_caps;
101 unsigned int max_level;
82 102
83 for (i = 0; i < NUM_VIRT_COUNTERS; i++) { 103 if (!boot_cpu_has(X86_FEATURE_IBS))
84 int hw_counter = op_x86_virt_to_phys(i); 104 return 0;
85 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) 105
86 msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; 106 /* check IBS cpuid feature flags */
87 else 107 max_level = cpuid_eax(0x80000000);
88 msrs->multiplex[i].addr = 0; 108 if (max_level < IBS_CPUID_FEATURES)
89 } 109 return IBS_CAPS_AVAIL;
110
111 ibs_caps = cpuid_eax(IBS_CPUID_FEATURES);
112 if (!(ibs_caps & IBS_CAPS_AVAIL))
113 /* cpuid flags not valid */
114 return IBS_CAPS_AVAIL;
115
116 return ibs_caps;
90} 117}
91 118
119#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
120
92static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, 121static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
93 struct op_msrs const * const msrs) 122 struct op_msrs const * const msrs)
94{ 123{
@@ -98,7 +127,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
98 /* enable active counters */ 127 /* enable active counters */
99 for (i = 0; i < NUM_COUNTERS; ++i) { 128 for (i = 0; i < NUM_COUNTERS; ++i) {
100 int virt = op_x86_phys_to_virt(i); 129 int virt = op_x86_phys_to_virt(i);
101 if (!counter_config[virt].enabled) 130 if (!reset_value[virt])
102 continue; 131 continue;
103 rdmsrl(msrs->controls[i].addr, val); 132 rdmsrl(msrs->controls[i].addr, val);
104 val &= model->reserved; 133 val &= model->reserved;
@@ -107,10 +136,6 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
107 } 136 }
108} 137}
109 138
110#else
111
112static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { }
113
114#endif 139#endif
115 140
116/* functions for op_amd_spec */ 141/* functions for op_amd_spec */
@@ -122,18 +147,12 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
122 for (i = 0; i < NUM_COUNTERS; i++) { 147 for (i = 0; i < NUM_COUNTERS; i++) {
123 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) 148 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
124 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; 149 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
125 else
126 msrs->counters[i].addr = 0;
127 } 150 }
128 151
129 for (i = 0; i < NUM_CONTROLS; i++) { 152 for (i = 0; i < NUM_CONTROLS; i++) {
130 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) 153 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
131 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; 154 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
132 else
133 msrs->controls[i].addr = 0;
134 } 155 }
135
136 op_mux_fill_in_addresses(msrs);
137} 156}
138 157
139static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, 158static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
@@ -144,7 +163,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
144 163
145 /* setup reset_value */ 164 /* setup reset_value */
146 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { 165 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
147 if (counter_config[i].enabled) 166 if (counter_config[i].enabled
167 && msrs->counters[op_x86_virt_to_phys(i)].addr)
148 reset_value[i] = counter_config[i].count; 168 reset_value[i] = counter_config[i].count;
149 else 169 else
150 reset_value[i] = 0; 170 reset_value[i] = 0;
@@ -152,9 +172,18 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
152 172
153 /* clear all counters */ 173 /* clear all counters */
154 for (i = 0; i < NUM_CONTROLS; ++i) { 174 for (i = 0; i < NUM_CONTROLS; ++i) {
155 if (unlikely(!msrs->controls[i].addr)) 175 if (unlikely(!msrs->controls[i].addr)) {
176 if (counter_config[i].enabled && !smp_processor_id())
177 /*
178 * counter is reserved, this is on all
179 * cpus, so report only for cpu #0
180 */
181 op_x86_warn_reserved(i);
156 continue; 182 continue;
183 }
157 rdmsrl(msrs->controls[i].addr, val); 184 rdmsrl(msrs->controls[i].addr, val);
185 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
186 op_x86_warn_in_use(i);
158 val &= model->reserved; 187 val &= model->reserved;
159 wrmsrl(msrs->controls[i].addr, val); 188 wrmsrl(msrs->controls[i].addr, val);
160 } 189 }
@@ -169,9 +198,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
169 /* enable active counters */ 198 /* enable active counters */
170 for (i = 0; i < NUM_COUNTERS; ++i) { 199 for (i = 0; i < NUM_COUNTERS; ++i) {
171 int virt = op_x86_phys_to_virt(i); 200 int virt = op_x86_phys_to_virt(i);
172 if (!counter_config[virt].enabled) 201 if (!reset_value[virt])
173 continue;
174 if (!msrs->counters[i].addr)
175 continue; 202 continue;
176 203
177 /* setup counter registers */ 204 /* setup counter registers */
@@ -185,7 +212,60 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
185 } 212 }
186} 213}
187 214
188#ifdef CONFIG_OPROFILE_IBS 215/*
216 * 16-bit Linear Feedback Shift Register (LFSR)
217 *
218 *
219 * Feedback polynomial = X^16 + X^14 + X^13 + X^11 + 1
220 */
221static unsigned int lfsr_random(void)
222{
223 static unsigned int lfsr_value = 0xF00D;
224 unsigned int bit;
225
226 /* Compute next bit to shift in */
227 bit = ((lfsr_value >> 0) ^
228 (lfsr_value >> 2) ^
229 (lfsr_value >> 3) ^
230 (lfsr_value >> 5)) & 0x0001;
231
232 /* Advance to next register value */
233 lfsr_value = (lfsr_value >> 1) | (bit << 15);
234
235 return lfsr_value;
236}
237
238/*
239 * IBS software randomization
240 *
241 * The IBS periodic op counter is randomized in software. The lower 12
242 * bits of the 20 bit counter are randomized. IbsOpCurCnt is
243 * initialized with a 12 bit random value.
244 */
245static inline u64 op_amd_randomize_ibs_op(u64 val)
246{
247 unsigned int random = lfsr_random();
248
249 if (!(ibs_caps & IBS_CAPS_RDWROPCNT))
250 /*
251 * Work around if the hw can not write to IbsOpCurCnt
252 *
253 * Randomize the lower 8 bits of the 16 bit
254 * IbsOpMaxCnt [15:0] value in the range of -128 to
255 * +127 by adding/subtracting an offset to the
256 * maximum count (IbsOpMaxCnt).
257 *
258 * To avoid over or underflows and protect upper bits
259 * starting at bit 16, the initial value for
260 * IbsOpMaxCnt must fit in the range from 0x0081 to
261 * 0xff80.
262 */
263 val += (s8)(random >> 4);
264 else
265 val |= (u64)(random & IBS_RANDOM_MASK) << 32;
266
267 return val;
268}
189 269
190static inline void 270static inline void
191op_amd_handle_ibs(struct pt_regs * const regs, 271op_amd_handle_ibs(struct pt_regs * const regs,
@@ -194,7 +274,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
194 u64 val, ctl; 274 u64 val, ctl;
195 struct op_entry entry; 275 struct op_entry entry;
196 276
197 if (!has_ibs) 277 if (!ibs_caps)
198 return; 278 return;
199 279
200 if (ibs_config.fetch_enabled) { 280 if (ibs_config.fetch_enabled) {
@@ -236,8 +316,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
236 oprofile_write_commit(&entry); 316 oprofile_write_commit(&entry);
237 317
238 /* reenable the IRQ */ 318 /* reenable the IRQ */
239 ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; 319 ctl = op_amd_randomize_ibs_op(ibs_op_ctl);
240 ctl |= IBS_OP_ENABLE;
241 wrmsrl(MSR_AMD64_IBSOPCTL, ctl); 320 wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
242 } 321 }
243 } 322 }
@@ -246,41 +325,57 @@ op_amd_handle_ibs(struct pt_regs * const regs,
246static inline void op_amd_start_ibs(void) 325static inline void op_amd_start_ibs(void)
247{ 326{
248 u64 val; 327 u64 val;
249 if (has_ibs && ibs_config.fetch_enabled) { 328
329 if (!ibs_caps)
330 return;
331
332 if (ibs_config.fetch_enabled) {
250 val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; 333 val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
251 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; 334 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
252 val |= IBS_FETCH_ENABLE; 335 val |= IBS_FETCH_ENABLE;
253 wrmsrl(MSR_AMD64_IBSFETCHCTL, val); 336 wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
254 } 337 }
255 338
256 if (has_ibs && ibs_config.op_enabled) { 339 if (ibs_config.op_enabled) {
257 val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; 340 ibs_op_ctl = ibs_config.max_cnt_op >> 4;
258 val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; 341 if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
259 val |= IBS_OP_ENABLE; 342 /*
343 * IbsOpCurCnt not supported. See
344 * op_amd_randomize_ibs_op() for details.
345 */
346 ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL);
347 } else {
348 /*
349 * The start value is randomized with a
350 * positive offset, we need to compensate it
351 * with the half of the randomized range. Also
352 * avoid underflows.
353 */
354 ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET,
355 0xFFFFULL);
356 }
357 if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops)
358 ibs_op_ctl |= IBS_OP_CNT_CTL;
359 ibs_op_ctl |= IBS_OP_ENABLE;
360 val = op_amd_randomize_ibs_op(ibs_op_ctl);
260 wrmsrl(MSR_AMD64_IBSOPCTL, val); 361 wrmsrl(MSR_AMD64_IBSOPCTL, val);
261 } 362 }
262} 363}
263 364
264static void op_amd_stop_ibs(void) 365static void op_amd_stop_ibs(void)
265{ 366{
266 if (has_ibs && ibs_config.fetch_enabled) 367 if (!ibs_caps)
368 return;
369
370 if (ibs_config.fetch_enabled)
267 /* clear max count and enable */ 371 /* clear max count and enable */
268 wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); 372 wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
269 373
270 if (has_ibs && ibs_config.op_enabled) 374 if (ibs_config.op_enabled)
271 /* clear max count and enable */ 375 /* clear max count and enable */
272 wrmsrl(MSR_AMD64_IBSOPCTL, 0); 376 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
273} 377}
274 378
275#else
276
277static inline void op_amd_handle_ibs(struct pt_regs * const regs,
278 struct op_msrs const * const msrs) { }
279static inline void op_amd_start_ibs(void) { }
280static inline void op_amd_stop_ibs(void) { }
281
282#endif
283
284static int op_amd_check_ctrs(struct pt_regs * const regs, 379static int op_amd_check_ctrs(struct pt_regs * const regs,
285 struct op_msrs const * const msrs) 380 struct op_msrs const * const msrs)
286{ 381{
@@ -355,8 +450,6 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
355 } 450 }
356} 451}
357 452
358#ifdef CONFIG_OPROFILE_IBS
359
360static u8 ibs_eilvt_off; 453static u8 ibs_eilvt_off;
361 454
362static inline void apic_init_ibs_nmi_per_cpu(void *arg) 455static inline void apic_init_ibs_nmi_per_cpu(void *arg)
@@ -405,45 +498,36 @@ static int init_ibs_nmi(void)
405 return 1; 498 return 1;
406 } 499 }
407 500
408#ifdef CONFIG_NUMA
409 /* Sanity check */
410 /* Works only for 64bit with proper numa implementation. */
411 if (nodes != num_possible_nodes()) {
412 printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, "
413 "found: %d, expected %d",
414 nodes, num_possible_nodes());
415 return 1;
416 }
417#endif
418 return 0; 501 return 0;
419} 502}
420 503
421/* uninitialize the APIC for the IBS interrupts if needed */ 504/* uninitialize the APIC for the IBS interrupts if needed */
422static void clear_ibs_nmi(void) 505static void clear_ibs_nmi(void)
423{ 506{
424 if (has_ibs) 507 if (ibs_caps)
425 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); 508 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
426} 509}
427 510
428/* initialize the APIC for the IBS interrupts if available */ 511/* initialize the APIC for the IBS interrupts if available */
429static void ibs_init(void) 512static void ibs_init(void)
430{ 513{
431 has_ibs = boot_cpu_has(X86_FEATURE_IBS); 514 ibs_caps = get_ibs_caps();
432 515
433 if (!has_ibs) 516 if (!ibs_caps)
434 return; 517 return;
435 518
436 if (init_ibs_nmi()) { 519 if (init_ibs_nmi()) {
437 has_ibs = 0; 520 ibs_caps = 0;
438 return; 521 return;
439 } 522 }
440 523
441 printk(KERN_INFO "oprofile: AMD IBS detected\n"); 524 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n",
525 (unsigned)ibs_caps);
442} 526}
443 527
444static void ibs_exit(void) 528static void ibs_exit(void)
445{ 529{
446 if (!has_ibs) 530 if (!ibs_caps)
447 return; 531 return;
448 532
449 clear_ibs_nmi(); 533 clear_ibs_nmi();
@@ -463,7 +547,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
463 if (ret) 547 if (ret)
464 return ret; 548 return ret;
465 549
466 if (!has_ibs) 550 if (!ibs_caps)
467 return ret; 551 return ret;
468 552
469 /* model specific files */ 553 /* model specific files */
@@ -473,7 +557,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
473 ibs_config.fetch_enabled = 0; 557 ibs_config.fetch_enabled = 0;
474 ibs_config.max_cnt_op = 250000; 558 ibs_config.max_cnt_op = 250000;
475 ibs_config.op_enabled = 0; 559 ibs_config.op_enabled = 0;
476 ibs_config.dispatched_ops = 1; 560 ibs_config.dispatched_ops = 0;
477 561
478 dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); 562 dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
479 oprofilefs_create_ulong(sb, dir, "enable", 563 oprofilefs_create_ulong(sb, dir, "enable",
@@ -488,8 +572,9 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
488 &ibs_config.op_enabled); 572 &ibs_config.op_enabled);
489 oprofilefs_create_ulong(sb, dir, "max_count", 573 oprofilefs_create_ulong(sb, dir, "max_count",
490 &ibs_config.max_cnt_op); 574 &ibs_config.max_cnt_op);
491 oprofilefs_create_ulong(sb, dir, "dispatched_ops", 575 if (ibs_caps & IBS_CAPS_OPCNT)
492 &ibs_config.dispatched_ops); 576 oprofilefs_create_ulong(sb, dir, "dispatched_ops",
577 &ibs_config.dispatched_ops);
493 578
494 return 0; 579 return 0;
495} 580}
@@ -507,19 +592,6 @@ static void op_amd_exit(void)
507 ibs_exit(); 592 ibs_exit();
508} 593}
509 594
510#else
511
512/* no IBS support */
513
514static int op_amd_init(struct oprofile_operations *ops)
515{
516 return 0;
517}
518
519static void op_amd_exit(void) {}
520
521#endif /* CONFIG_OPROFILE_IBS */
522
523struct op_x86_model_spec op_amd_spec = { 595struct op_x86_model_spec op_amd_spec = {
524 .num_counters = NUM_COUNTERS, 596 .num_counters = NUM_COUNTERS,
525 .num_controls = NUM_CONTROLS, 597 .num_controls = NUM_CONTROLS,
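
The op_model_amd.c changes add a 16-bit LFSR with feedback polynomial x^16 + x^14 + x^13 + x^11 + 1 to randomize the IBS op counter. The same generator, lifted into a standalone program so the sequence can be inspected:

/*
 * Standalone copy of the 16-bit Fibonacci LFSR added above; taps at
 * bits 0, 2, 3 and 5 of the right-shifting register correspond to the
 * stated polynomial.
 */
#include <stdio.h>

static unsigned int lfsr_random(void)
{
	static unsigned int lfsr_value = 0xF00D;
	unsigned int bit;

	/* Compute the next bit to shift in */
	bit = ((lfsr_value >> 0) ^ (lfsr_value >> 2) ^
	       (lfsr_value >> 3) ^ (lfsr_value >> 5)) & 0x0001;

	/* Advance to the next register value */
	lfsr_value = (lfsr_value >> 1) | (bit << 15);

	return lfsr_value;
}

int main(void)
{
	int i;

	for (i = 0; i < 8; i++)
		printf("%#06x\n", lfsr_random());
	return 0;
}
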
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index ac6b354becdf..e6a160a4684a 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -394,12 +394,6 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
394 setup_num_counters(); 394 setup_num_counters();
395 stag = get_stagger(); 395 stag = get_stagger();
396 396
397 /* initialize some registers */
398 for (i = 0; i < num_counters; ++i)
399 msrs->counters[i].addr = 0;
400 for (i = 0; i < num_controls; ++i)
401 msrs->controls[i].addr = 0;
402
403 /* the counter & cccr registers we pay attention to */ 397 /* the counter & cccr registers we pay attention to */
404 for (i = 0; i < num_counters; ++i) { 398 for (i = 0; i < num_counters; ++i) {
405 addr = p4_counters[VIRT_CTR(stag, i)].counter_address; 399 addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 8eb05878554c..5d1727ba409e 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -37,15 +37,11 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs)
37 for (i = 0; i < num_counters; i++) { 37 for (i = 0; i < num_counters; i++) {
38 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) 38 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
39 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; 39 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
40 else
41 msrs->counters[i].addr = 0;
42 } 40 }
43 41
44 for (i = 0; i < num_counters; i++) { 42 for (i = 0; i < num_counters; i++) {
45 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) 43 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
46 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; 44 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
47 else
48 msrs->controls[i].addr = 0;
49 } 45 }
50} 46}
51 47
@@ -57,7 +53,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
57 int i; 53 int i;
58 54
59 if (!reset_value) { 55 if (!reset_value) {
60 reset_value = kmalloc(sizeof(reset_value[0]) * num_counters, 56 reset_value = kzalloc(sizeof(reset_value[0]) * num_counters,
61 GFP_ATOMIC); 57 GFP_ATOMIC);
62 if (!reset_value) 58 if (!reset_value)
63 return; 59 return;
@@ -82,9 +78,18 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
82 78
83 /* clear all counters */ 79 /* clear all counters */
84 for (i = 0; i < num_counters; ++i) { 80 for (i = 0; i < num_counters; ++i) {
85 if (unlikely(!msrs->controls[i].addr)) 81 if (unlikely(!msrs->controls[i].addr)) {
82 if (counter_config[i].enabled && !smp_processor_id())
83 /*
84 * counter is reserved, this is on all
85 * cpus, so report only for cpu #0
86 */
87 op_x86_warn_reserved(i);
86 continue; 88 continue;
89 }
87 rdmsrl(msrs->controls[i].addr, val); 90 rdmsrl(msrs->controls[i].addr, val);
91 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
92 op_x86_warn_in_use(i);
88 val &= model->reserved; 93 val &= model->reserved;
89 wrmsrl(msrs->controls[i].addr, val); 94 wrmsrl(msrs->controls[i].addr, val);
90 } 95 }
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 7b8e75d16081..ff82a755edd4 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -57,6 +57,26 @@ struct op_x86_model_spec {
57 57
58struct op_counter_config; 58struct op_counter_config;
59 59
60static inline void op_x86_warn_in_use(int counter)
61{
62 /*
63 * The warning indicates an already running counter. If
64 * oprofile doesn't collect data, then try using a different
65 * performance counter on your platform to monitor the desired
66 * event. Delete counter #%d from the desired event by editing
67 * the /usr/share/oprofile/%s/<cpu>/events file. If the event
68 * cannot be monitored by any other counter, contact your
69 * hardware or BIOS vendor.
70 */
71 pr_warning("oprofile: counter #%d on cpu #%d may already be used\n",
72 counter, smp_processor_id());
73}
74
75static inline void op_x86_warn_reserved(int counter)
76{
77 pr_warning("oprofile: counter #%d is already reserved\n", counter);
78}
79
60extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, 80extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
61 struct op_counter_config *counter_config); 81 struct op_counter_config *counter_config);
62extern int op_x86_phys_to_virt(int phys); 82extern int op_x86_phys_to_virt(int phys);
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 56caf2a18baa..b110d97fb925 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -16,8 +16,7 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
16obj-$(CONFIG_X86_MRST) += mrst.o 16obj-$(CONFIG_X86_MRST) += mrst.o
17 17
18obj-y += common.o early.o 18obj-y += common.o early.o
19obj-y += amd_bus.o 19obj-y += amd_bus.o bus_numa.o
20obj-$(CONFIG_X86_64) += bus_numa.o
21 20
22ifeq ($(CONFIG_PCI_DEBUG),y) 21ifeq ($(CONFIG_PCI_DEBUG),y)
23EXTRA_CFLAGS += -DDEBUG 22EXTRA_CFLAGS += -DDEBUG
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index b53f0487e2d3..6e22454bfaa6 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -15,6 +15,51 @@ struct pci_root_info {
15 int busnum; 15 int busnum;
16}; 16};
17 17
18static bool pci_use_crs = true;
19
20static int __init set_use_crs(const struct dmi_system_id *id)
21{
22 pci_use_crs = true;
23 return 0;
24}
25
26static const struct dmi_system_id pci_use_crs_table[] __initconst = {
27 /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
28 {
29 .callback = set_use_crs,
30 .ident = "IBM System x3800",
31 .matches = {
32 DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
33 DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
34 },
35 },
36 {}
37};
38
39void __init pci_acpi_crs_quirks(void)
40{
41 int year;
42
43 if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008)
44 pci_use_crs = false;
45
46 dmi_check_system(pci_use_crs_table);
47
48 /*
49 * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that
50 * takes precedence over anything we figured out above.
51 */
52 if (pci_probe & PCI_ROOT_NO_CRS)
53 pci_use_crs = false;
54 else if (pci_probe & PCI_USE__CRS)
55 pci_use_crs = true;
56
57 printk(KERN_INFO "PCI: %s host bridge windows from ACPI; "
58 "if necessary, use \"pci=%s\" and report a bug\n",
59 pci_use_crs ? "Using" : "Ignoring",
60 pci_use_crs ? "nocrs" : "use_crs");
61}
62
18static acpi_status 63static acpi_status
19resource_to_addr(struct acpi_resource *resource, 64resource_to_addr(struct acpi_resource *resource,
20 struct acpi_resource_address64 *addr) 65 struct acpi_resource_address64 *addr)
@@ -45,20 +90,6 @@ count_resource(struct acpi_resource *acpi_res, void *data)
45 return AE_OK; 90 return AE_OK;
46} 91}
47 92
48static int
49bus_has_transparent_bridge(struct pci_bus *bus)
50{
51 struct pci_dev *dev;
52
53 list_for_each_entry(dev, &bus->devices, bus_list) {
54 u16 class = dev->class >> 8;
55
56 if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent)
57 return true;
58 }
59 return false;
60}
61
62static void 93static void
63align_resource(struct acpi_device *bridge, struct resource *res) 94align_resource(struct acpi_device *bridge, struct resource *res)
64{ 95{
@@ -92,12 +123,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
92 acpi_status status; 123 acpi_status status;
93 unsigned long flags; 124 unsigned long flags;
94 struct resource *root; 125 struct resource *root;
95 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES;
96 u64 start, end; 126 u64 start, end;
97 127
98 if (bus_has_transparent_bridge(info->bus))
99 max_root_bus_resources -= 3;
100
101 status = resource_to_addr(acpi_res, &addr); 128 status = resource_to_addr(acpi_res, &addr);
102 if (!ACPI_SUCCESS(status)) 129 if (!ACPI_SUCCESS(status))
103 return AE_OK; 130 return AE_OK;
@@ -115,15 +142,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
115 142
116 start = addr.minimum + addr.translation_offset; 143 start = addr.minimum + addr.translation_offset;
117 end = start + addr.address_length - 1; 144 end = start + addr.address_length - 1;
118 if (info->res_num >= max_root_bus_resources) {
119 if (pci_probe & PCI_USE__CRS)
120 printk(KERN_WARNING "PCI: Failed to allocate "
121 "0x%lx-0x%lx from %s for %s due to _CRS "
122 "returning more than %d resource descriptors\n",
123 (unsigned long) start, (unsigned long) end,
124 root->name, info->name, max_root_bus_resources);
125 return AE_OK;
126 }
127 145
128 res = &info->res[info->res_num]; 146 res = &info->res[info->res_num];
129 res->name = info->name; 147 res->name = info->name;
@@ -133,7 +151,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
133 res->child = NULL; 151 res->child = NULL;
134 align_resource(info->bridge, res); 152 align_resource(info->bridge, res);
135 153
136 if (!(pci_probe & PCI_USE__CRS)) { 154 if (!pci_use_crs) {
137 dev_printk(KERN_DEBUG, &info->bridge->dev, 155 dev_printk(KERN_DEBUG, &info->bridge->dev,
138 "host bridge window %pR (ignored)\n", res); 156 "host bridge window %pR (ignored)\n", res);
139 return AE_OK; 157 return AE_OK;
@@ -143,7 +161,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
143 dev_err(&info->bridge->dev, 161 dev_err(&info->bridge->dev,
144 "can't allocate host bridge window %pR\n", res); 162 "can't allocate host bridge window %pR\n", res);
145 } else { 163 } else {
146 info->bus->resource[info->res_num] = res; 164 pci_bus_add_resource(info->bus, res, 0);
147 info->res_num++; 165 info->res_num++;
148 if (addr.translation_offset) 166 if (addr.translation_offset)
149 dev_info(&info->bridge->dev, "host bridge window %pR " 167 dev_info(&info->bridge->dev, "host bridge window %pR "
@@ -164,10 +182,8 @@ get_current_resources(struct acpi_device *device, int busnum,
164 struct pci_root_info info; 182 struct pci_root_info info;
165 size_t size; 183 size_t size;
166 184
167 if (!(pci_probe & PCI_USE__CRS)) 185 if (pci_use_crs)
168 dev_info(&device->dev, 186 pci_bus_remove_resources(bus);
169 "ignoring host bridge windows from ACPI; "
170 "boot with \"pci=use_crs\" to use them\n");
171 187
172 info.bridge = device; 188 info.bridge = device;
173 info.bus = bus; 189 info.bus = bus;
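
The new pci_acpi_crs_quirks() boils down to a small precedence rule: trust _CRS by default, distrust it on pre-2008 BIOSes, trust it again on DMI-whitelisted machines such as the IBM x3800, and let an explicit pci=nocrs or pci=use_crs on the command line override everything. A standalone sketch of that decision logic, with plain booleans standing in for dmi_get_date(), dmi_check_system() and the pci_probe flags:

#include <stdbool.h>
#include <stdio.h>

static bool decide_use_crs(int bios_year, bool dmi_whitelisted,
                           bool cmdline_nocrs, bool cmdline_use_crs)
{
        bool use_crs = true;                    /* default: trust _CRS */

        if (bios_year && bios_year < 2008)      /* old BIOSes often report bogus windows */
                use_crs = false;
        if (dmi_whitelisted)                    /* known-good machines, e.g. IBM x3800 */
                use_crs = true;

        /* an explicit "pci=nocrs" / "pci=use_crs" always wins */
        if (cmdline_nocrs)
                use_crs = false;
        else if (cmdline_use_crs)
                use_crs = true;

        return use_crs;
}

int main(void)
{
        printf("2006 BIOS, no overrides -> %d\n", decide_use_crs(2006, false, false, false));
        printf("2006 BIOS, pci=use_crs  -> %d\n", decide_use_crs(2006, false, false, true));
        return 0;
}
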
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 95ecbd495955..fc1e8fe07e5c 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -2,11 +2,11 @@
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/topology.h> 3#include <linux/topology.h>
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/range.h>
6
5#include <asm/pci_x86.h> 7#include <asm/pci_x86.h>
6 8
7#ifdef CONFIG_X86_64
8#include <asm/pci-direct.h> 9#include <asm/pci-direct.h>
9#endif
10 10
11#include "bus_numa.h" 11#include "bus_numa.h"
12 12
@@ -15,60 +15,6 @@
15 * also get peer root bus resource for io,mmio 15 * also get peer root bus resource for io,mmio
16 */ 16 */
17 17
18#ifdef CONFIG_X86_64
19
20#define RANGE_NUM 16
21
22struct res_range {
23 size_t start;
24 size_t end;
25};
26
27static void __init update_range(struct res_range *range, size_t start,
28 size_t end)
29{
30 int i;
31 int j;
32
33 for (j = 0; j < RANGE_NUM; j++) {
34 if (!range[j].end)
35 continue;
36
37 if (start <= range[j].start && end >= range[j].end) {
38 range[j].start = 0;
39 range[j].end = 0;
40 continue;
41 }
42
43 if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) {
44 range[j].start = end + 1;
45 continue;
46 }
47
48
49 if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) {
50 range[j].end = start - 1;
51 continue;
52 }
53
54 if (start > range[j].start && end < range[j].end) {
55 /* find the new spare */
56 for (i = 0; i < RANGE_NUM; i++) {
57 if (range[i].end == 0)
58 break;
59 }
60 if (i < RANGE_NUM) {
61 range[i].end = range[j].end;
62 range[i].start = end + 1;
63 } else {
64 printk(KERN_ERR "run of slot in ranges\n");
65 }
66 range[j].end = start - 1;
67 continue;
68 }
69 }
70}
71
72struct pci_hostbridge_probe { 18struct pci_hostbridge_probe {
73 u32 bus; 19 u32 bus;
74 u32 slot; 20 u32 slot;
@@ -111,6 +57,8 @@ static void __init get_pci_mmcfg_amd_fam10h_range(void)
111 fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; 57 fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
112} 58}
113 59
60#define RANGE_NUM 16
61
114/** 62/**
115 * early_fill_mp_bus_to_node() 63 * early_fill_mp_bus_to_node()
116 * called before pcibios_scan_root and pci_scan_bus 64 * called before pcibios_scan_root and pci_scan_bus
@@ -130,16 +78,17 @@ static int __init early_fill_mp_bus_info(void)
130 struct pci_root_info *info; 78 struct pci_root_info *info;
131 u32 reg; 79 u32 reg;
132 struct resource *res; 80 struct resource *res;
133 size_t start; 81 u64 start;
134 size_t end; 82 u64 end;
135 struct res_range range[RANGE_NUM]; 83 struct range range[RANGE_NUM];
136 u64 val; 84 u64 val;
137 u32 address; 85 u32 address;
86 bool found;
138 87
139 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
140 return -1; 89 return -1;
141 90
142 found_all_numa_early = 0; 91 found = false;
143 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 92 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
144 u32 id; 93 u32 id;
145 u16 device; 94 u16 device;
@@ -153,12 +102,12 @@ static int __init early_fill_mp_bus_info(void)
153 device = (id>>16) & 0xffff; 102 device = (id>>16) & 0xffff;
154 if (pci_probes[i].vendor == vendor && 103 if (pci_probes[i].vendor == vendor &&
155 pci_probes[i].device == device) { 104 pci_probes[i].device == device) {
156 found_all_numa_early = 1; 105 found = true;
157 break; 106 break;
158 } 107 }
159 } 108 }
160 109
161 if (!found_all_numa_early) 110 if (!found)
162 return 0; 111 return 0;
163 112
164 pci_root_num = 0; 113 pci_root_num = 0;
@@ -196,7 +145,7 @@ static int __init early_fill_mp_bus_info(void)
196 def_link = (reg >> 8) & 0x03; 145 def_link = (reg >> 8) & 0x03;
197 146
198 memset(range, 0, sizeof(range)); 147 memset(range, 0, sizeof(range));
199 range[0].end = 0xffff; 148 add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);
200 /* io port resource */ 149 /* io port resource */
201 for (i = 0; i < 4; i++) { 150 for (i = 0; i < 4; i++) {
202 reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3)); 151 reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3));
@@ -220,13 +169,13 @@ static int __init early_fill_mp_bus_info(void)
220 169
221 info = &pci_root_info[j]; 170 info = &pci_root_info[j];
222 printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", 171 printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n",
223 node, link, (u64)start, (u64)end); 172 node, link, start, end);
224 173
225 /* kernel only handle 16 bit only */ 174 /* kernel only handle 16 bit only */
226 if (end > 0xffff) 175 if (end > 0xffff)
227 end = 0xffff; 176 end = 0xffff;
228 update_res(info, start, end, IORESOURCE_IO, 1); 177 update_res(info, start, end, IORESOURCE_IO, 1);
229 update_range(range, start, end); 178 subtract_range(range, RANGE_NUM, start, end + 1);
230 } 179 }
231 /* add left over io port range to def node/link, [0, 0xffff] */ 180 /* add left over io port range to def node/link, [0, 0xffff] */
232 /* find the position */ 181 /* find the position */
@@ -241,29 +190,32 @@ static int __init early_fill_mp_bus_info(void)
241 if (!range[i].end) 190 if (!range[i].end)
242 continue; 191 continue;
243 192
244 update_res(info, range[i].start, range[i].end, 193 update_res(info, range[i].start, range[i].end - 1,
245 IORESOURCE_IO, 1); 194 IORESOURCE_IO, 1);
246 } 195 }
247 } 196 }
248 197
249 memset(range, 0, sizeof(range)); 198 memset(range, 0, sizeof(range));
250 /* 0xfd00000000-0xffffffffff for HT */ 199 /* 0xfd00000000-0xffffffffff for HT */
251 range[0].end = (0xfdULL<<32) - 1; 200 end = cap_resource((0xfdULL<<32) - 1);
201 end++;
202 add_range(range, RANGE_NUM, 0, 0, end);
252 203
253 /* need to take out [0, TOM) for RAM*/ 204 /* need to take out [0, TOM) for RAM*/
254 address = MSR_K8_TOP_MEM1; 205 address = MSR_K8_TOP_MEM1;
255 rdmsrl(address, val); 206 rdmsrl(address, val);
256 end = (val & 0xffffff800000ULL); 207 end = (val & 0xffffff800000ULL);
257 printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); 208 printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20);
258 if (end < (1ULL<<32)) 209 if (end < (1ULL<<32))
259 update_range(range, 0, end - 1); 210 subtract_range(range, RANGE_NUM, 0, end);
260 211
261 /* get mmconfig */ 212 /* get mmconfig */
262 get_pci_mmcfg_amd_fam10h_range(); 213 get_pci_mmcfg_amd_fam10h_range();
263 /* need to take out mmconf range */ 214 /* need to take out mmconf range */
264 if (fam10h_mmconf_end) { 215 if (fam10h_mmconf_end) {
265 printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); 216 printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
266 update_range(range, fam10h_mmconf_start, fam10h_mmconf_end); 217 subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
218 fam10h_mmconf_end + 1);
267 } 219 }
268 220
269 /* mmio resource */ 221 /* mmio resource */
@@ -293,7 +245,7 @@ static int __init early_fill_mp_bus_info(void)
293 info = &pci_root_info[j]; 245 info = &pci_root_info[j];
294 246
295 printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", 247 printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]",
296 node, link, (u64)start, (u64)end); 248 node, link, start, end);
297 /* 249 /*
298 * some sick allocation would have range overlap with fam10h 250 * some sick allocation would have range overlap with fam10h
299 * mmconf range, so need to update start and end. 251 * mmconf range, so need to update start and end.
@@ -318,14 +270,15 @@ static int __init early_fill_mp_bus_info(void)
318 /* we got a hole */ 270 /* we got a hole */
319 endx = fam10h_mmconf_start - 1; 271 endx = fam10h_mmconf_start - 1;
320 update_res(info, start, endx, IORESOURCE_MEM, 0); 272 update_res(info, start, endx, IORESOURCE_MEM, 0);
321 update_range(range, start, endx); 273 subtract_range(range, RANGE_NUM, start,
322 printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); 274 endx + 1);
275 printk(KERN_CONT " ==> [%llx, %llx]", start, endx);
323 start = fam10h_mmconf_end + 1; 276 start = fam10h_mmconf_end + 1;
324 changed = 1; 277 changed = 1;
325 } 278 }
326 if (changed) { 279 if (changed) {
327 if (start <= end) { 280 if (start <= end) {
328 printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end); 281 printk(KERN_CONT " %s [%llx, %llx]", endx ? "and" : "==>", start, end);
329 } else { 282 } else {
330 printk(KERN_CONT "%s\n", endx?"":" ==> none"); 283 printk(KERN_CONT "%s\n", endx?"":" ==> none");
331 continue; 284 continue;
@@ -333,8 +286,9 @@ static int __init early_fill_mp_bus_info(void)
333 } 286 }
334 } 287 }
335 288
336 update_res(info, start, end, IORESOURCE_MEM, 1); 289 update_res(info, cap_resource(start), cap_resource(end),
337 update_range(range, start, end); 290 IORESOURCE_MEM, 1);
291 subtract_range(range, RANGE_NUM, start, end + 1);
338 printk(KERN_CONT "\n"); 292 printk(KERN_CONT "\n");
339 } 293 }
340 294
@@ -348,8 +302,8 @@ static int __init early_fill_mp_bus_info(void)
348 address = MSR_K8_TOP_MEM2; 302 address = MSR_K8_TOP_MEM2;
349 rdmsrl(address, val); 303 rdmsrl(address, val);
350 end = (val & 0xffffff800000ULL); 304 end = (val & 0xffffff800000ULL);
351 printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); 305 printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20);
352 update_range(range, 1ULL<<32, end - 1); 306 subtract_range(range, RANGE_NUM, 1ULL<<32, end);
353 } 307 }
354 308
355 /* 309 /*
@@ -368,7 +322,8 @@ static int __init early_fill_mp_bus_info(void)
368 if (!range[i].end) 322 if (!range[i].end)
369 continue; 323 continue;
370 324
371 update_res(info, range[i].start, range[i].end, 325 update_res(info, cap_resource(range[i].start),
326 cap_resource(range[i].end - 1),
372 IORESOURCE_MEM, 1); 327 IORESOURCE_MEM, 1);
373 } 328 }
374 } 329 }
@@ -384,24 +339,14 @@ static int __init early_fill_mp_bus_info(void)
384 info->bus_min, info->bus_max, info->node, info->link); 339 info->bus_min, info->bus_max, info->node, info->link);
385 for (j = 0; j < res_num; j++) { 340 for (j = 0; j < res_num; j++) {
386 res = &info->res[j]; 341 res = &info->res[j];
387 printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n", 342 printk(KERN_DEBUG "bus: %02x index %x %pR\n",
388 busnum, j, 343 busnum, j, res);
389 (res->flags & IORESOURCE_IO)?"io port":"mmio",
390 res->start, res->end);
391 } 344 }
392 } 345 }
393 346
394 return 0; 347 return 0;
395} 348}
396 349
397#else /* !CONFIG_X86_64 */
398
399static int __init early_fill_mp_bus_info(void) { return 0; }
400
401#endif /* !CONFIG_X86_64 */
402
403/* common 32/64 bit code */
404
405#define ENABLE_CF8_EXT_CFG (1ULL << 46) 350#define ENABLE_CF8_EXT_CFG (1ULL << 46)
406 351
407static void enable_pci_io_ecs(void *unused) 352static void enable_pci_io_ecs(void *unused)
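
The amd_bus.c conversion drops the file-private res_range/update_range() code in favour of the generic struct range helpers from <linux/range.h>; judging by the "+ 1"/"- 1" adjustments in the hunks above, those helpers treat ranges as half-open [start, end). The standalone sketch below mimics that bookkeeping (it is not the kernel/range.c implementation): start from the space below the HT hole and subtract the RAM-below-TOM and mmconfig windows, leaving the MMIO candidates that get fed to update_res().

#include <stdio.h>
#include <stdint.h>

#define RANGE_NUM 16

struct range { uint64_t start, end; };          /* end is exclusive; end == 0 marks a free slot */

static void subtract_range(struct range *r, int n, uint64_t start, uint64_t end)
{
        for (int i = 0; i < n; i++) {
                if (r[i].end == 0)
                        continue;
                if (start <= r[i].start && end >= r[i].end) {           /* fully covered */
                        r[i].start = r[i].end = 0;
                } else if (start <= r[i].start && end > r[i].start) {   /* clip the head */
                        r[i].start = end;
                } else if (end >= r[i].end && start < r[i].end) {       /* clip the tail */
                        r[i].end = start;
                } else if (start > r[i].start && end < r[i].end) {      /* punch a hole */
                        for (int j = 0; j < n; j++) {
                                if (r[j].end == 0) {
                                        r[j].start = end;
                                        r[j].end = r[i].end;
                                        break;
                                }
                        }
                        r[i].end = start;
                }
        }
}

int main(void)
{
        struct range range[RANGE_NUM] = { { 0, 0xfdULL << 32 } };       /* everything below the HT hole */

        subtract_range(range, RANGE_NUM, 0, 0xc0000000);                /* pretend TOM = 3G of RAM */
        subtract_range(range, RANGE_NUM, 0xe0000000, 0xf0000000);       /* pretend mmconfig window */

        for (int i = 0; i < RANGE_NUM; i++)
                if (range[i].end)
                        printf("mmio candidate: [%#llx, %#llx)\n",
                               (unsigned long long)range[i].start,
                               (unsigned long long)range[i].end);
        return 0;
}
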
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index f939d603adfa..64a122883896 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -1,11 +1,11 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/range.h>
3 4
4#include "bus_numa.h" 5#include "bus_numa.h"
5 6
6int pci_root_num; 7int pci_root_num;
7struct pci_root_info pci_root_info[PCI_ROOT_NR]; 8struct pci_root_info pci_root_info[PCI_ROOT_NR];
8int found_all_numa_early;
9 9
10void x86_pci_root_bus_res_quirks(struct pci_bus *b) 10void x86_pci_root_bus_res_quirks(struct pci_bus *b)
11{ 11{
@@ -21,10 +21,6 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
21 if (!pci_root_num) 21 if (!pci_root_num)
22 return; 22 return;
23 23
24 /* for amd, if only one root bus, don't need to do anything */
25 if (pci_root_num < 2 && found_all_numa_early)
26 return;
27
28 for (i = 0; i < pci_root_num; i++) { 24 for (i = 0; i < pci_root_num; i++) {
29 if (pci_root_info[i].bus_min == b->number) 25 if (pci_root_info[i].bus_min == b->number)
30 break; 26 break;
@@ -36,13 +32,14 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
36 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", 32 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
37 b->number); 33 b->number);
38 34
35 pci_bus_remove_resources(b);
39 info = &pci_root_info[i]; 36 info = &pci_root_info[i];
40 for (j = 0; j < info->res_num; j++) { 37 for (j = 0; j < info->res_num; j++) {
41 struct resource *res; 38 struct resource *res;
42 struct resource *root; 39 struct resource *root;
43 40
44 res = &info->res[j]; 41 res = &info->res[j];
45 b->resource[j] = res; 42 pci_bus_add_resource(b, res, 0);
46 if (res->flags & IORESOURCE_IO) 43 if (res->flags & IORESOURCE_IO)
47 root = &ioport_resource; 44 root = &ioport_resource;
48 else 45 else
@@ -51,8 +48,8 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
51 } 48 }
52} 49}
53 50
54void __devinit update_res(struct pci_root_info *info, size_t start, 51void __devinit update_res(struct pci_root_info *info, resource_size_t start,
55 size_t end, unsigned long flags, int merge) 52 resource_size_t end, unsigned long flags, int merge)
56{ 53{
57 int i; 54 int i;
58 struct resource *res; 55 struct resource *res;
@@ -60,25 +57,28 @@ void __devinit update_res(struct pci_root_info *info, size_t start,
60 if (start > end) 57 if (start > end)
61 return; 58 return;
62 59
60 if (start == MAX_RESOURCE)
61 return;
62
63 if (!merge) 63 if (!merge)
64 goto addit; 64 goto addit;
65 65
66 /* try to merge it with old one */ 66 /* try to merge it with old one */
67 for (i = 0; i < info->res_num; i++) { 67 for (i = 0; i < info->res_num; i++) {
68 size_t final_start, final_end; 68 resource_size_t final_start, final_end;
69 size_t common_start, common_end; 69 resource_size_t common_start, common_end;
70 70
71 res = &info->res[i]; 71 res = &info->res[i];
72 if (res->flags != flags) 72 if (res->flags != flags)
73 continue; 73 continue;
74 74
75 common_start = max((size_t)res->start, start); 75 common_start = max(res->start, start);
76 common_end = min((size_t)res->end, end); 76 common_end = min(res->end, end);
77 if (common_start > common_end + 1) 77 if (common_start > common_end + 1)
78 continue; 78 continue;
79 79
80 final_start = min((size_t)res->start, start); 80 final_start = min(res->start, start);
81 final_end = max((size_t)res->end, end); 81 final_end = max(res->end, end);
82 82
83 res->start = final_start; 83 res->start = final_start;
84 res->end = final_end; 84 res->end = final_end;
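
Two things are going on in update_res(): resources switch from size_t to resource_size_t, which avoids truncating 64-bit addresses on 32-bit configurations where resource_size_t is 64 bits wide, and same-flag windows are merged when they overlap or touch. A standalone sketch of just that merge test, using inclusive [start, end] windows as the kernel code does:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

typedef uint64_t resource_size_t;

static bool try_merge(resource_size_t *res_start, resource_size_t *res_end,
                      resource_size_t start, resource_size_t end)
{
        resource_size_t common_start = *res_start > start ? *res_start : start;
        resource_size_t common_end   = *res_end   < end   ? *res_end   : end;

        if (common_start > common_end + 1)      /* disjoint and not even adjacent */
                return false;

        *res_start = *res_start < start ? *res_start : start;
        *res_end   = *res_end   > end   ? *res_end   : end;
        return true;
}

int main(void)
{
        resource_size_t s = 0x1000, e = 0x1fff;
        bool merged = try_merge(&s, &e, 0x2000, 0x2fff);        /* adjacent: merges */

        printf("adjacent: %d -> [%#llx, %#llx]\n", merged,
               (unsigned long long)s, (unsigned long long)e);
        printf("disjoint: %d\n", try_merge(&s, &e, 0x5000, 0x5fff));
        return 0;
}
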
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index adbc23fe82ac..804a4b40c31a 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -1,9 +1,8 @@
1#ifdef CONFIG_X86_64 1#ifndef __BUS_NUMA_H
2 2#define __BUS_NUMA_H
3/* 3/*
4 * sub bus (transparent) will use entres from 3 to store extra from 4 * sub bus (transparent) will use entres from 3 to store extra from
5 * root, so need to make sure we have enough slot there, Should we 5 * root, so need to make sure we have enough slot there.
6 * increase PCI_BUS_NUM_RESOURCES?
7 */ 6 */
8#define RES_NUM 16 7#define RES_NUM 16
9struct pci_root_info { 8struct pci_root_info {
@@ -20,8 +19,7 @@ struct pci_root_info {
20#define PCI_ROOT_NR 4 19#define PCI_ROOT_NR 4
21extern int pci_root_num; 20extern int pci_root_num;
22extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; 21extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
23extern int found_all_numa_early;
24 22
25extern void update_res(struct pci_root_info *info, size_t start, 23extern void update_res(struct pci_root_info *info, resource_size_t start,
26 size_t end, unsigned long flags, int merge); 24 resource_size_t end, unsigned long flags, int merge);
27#endif 25#endif
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index f5770b5846a6..294e10cb11e1 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -514,6 +514,9 @@ char * __devinit pcibios_setup(char *str)
514 } else if (!strcmp(str, "use_crs")) { 514 } else if (!strcmp(str, "use_crs")) {
515 pci_probe |= PCI_USE__CRS; 515 pci_probe |= PCI_USE__CRS;
516 return NULL; 516 return NULL;
517 } else if (!strcmp(str, "nocrs")) {
518 pci_probe |= PCI_ROOT_NO_CRS;
519 return NULL;
517 } else if (!strcmp(str, "earlydump")) { 520 } else if (!strcmp(str, "earlydump")) {
518 pci_early_dump_regs = 1; 521 pci_early_dump_regs = 1;
519 return NULL; 522 return NULL;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 5dc9e8c63fcd..dece3eb9c906 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -60,22 +60,20 @@ skip_isa_ioresource_align(struct pci_dev *dev) {
60 * but we want to try to avoid allocating at 0x2900-0x2bff 60 * but we want to try to avoid allocating at 0x2900-0x2bff
61 * which might have be mirrored at 0x0100-0x03ff.. 61 * which might have be mirrored at 0x0100-0x03ff..
62 */ 62 */
63void 63resource_size_t
64pcibios_align_resource(void *data, struct resource *res, 64pcibios_align_resource(void *data, const struct resource *res,
65 resource_size_t size, resource_size_t align) 65 resource_size_t size, resource_size_t align)
66{ 66{
67 struct pci_dev *dev = data; 67 struct pci_dev *dev = data;
68 resource_size_t start = res->start;
68 69
69 if (res->flags & IORESOURCE_IO) { 70 if (res->flags & IORESOURCE_IO) {
70 resource_size_t start = res->start;
71
72 if (skip_isa_ioresource_align(dev)) 71 if (skip_isa_ioresource_align(dev))
73 return; 72 return start;
74 if (start & 0x300) { 73 if (start & 0x300)
75 start = (start + 0x3ff) & ~0x3ff; 74 start = (start + 0x3ff) & ~0x3ff;
76 res->start = start;
77 }
78 } 75 }
76 return start;
79} 77}
80EXPORT_SYMBOL(pcibios_align_resource); 78EXPORT_SYMBOL(pcibios_align_resource);
81 79
@@ -257,10 +255,6 @@ void __init pcibios_resource_survey(void)
257 */ 255 */
258fs_initcall(pcibios_assign_resources); 256fs_initcall(pcibios_assign_resources);
259 257
260void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b)
261{
262}
263
264/* 258/*
265 * If we set up a device for bus mastering, we need to check the latency 259 * If we set up a device for bus mastering, we need to check the latency
266 * timer as certain crappy BIOSes forget to set it properly. 260 * timer as certain crappy BIOSes forget to set it properly.
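
pcibios_align_resource() keeps its old heuristic, avoiding I/O starts whose offset within a 1K block lands in 0x100-0x3ff, where ISA's 10-bit address decode can mirror legacy ports; what changes is the calling convention, which now returns the adjusted start instead of writing res->start. A standalone sketch of the alignment rule itself:

#include <stdio.h>
#include <stdint.h>

static uint64_t align_io_start(uint64_t start)
{
        if (start & 0x300)                      /* offset 0x100-0x3ff inside a 1K block */
                start = (start + 0x3ff) & ~0x3ffULL;
        return start;
}

int main(void)
{
        printf("%#llx -> %#llx\n", 0x2900ULL, (unsigned long long)align_io_start(0x2900));      /* becomes 0x2c00 */
        printf("%#llx -> %#llx\n", 0x2c00ULL, (unsigned long long)align_io_start(0x2c00));      /* unchanged */
        return 0;
}
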
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index a60deb6e6696..8b107521d24e 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -590,6 +590,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
590 case PCI_DEVICE_ID_INTEL_ICH10_1: 590 case PCI_DEVICE_ID_INTEL_ICH10_1:
591 case PCI_DEVICE_ID_INTEL_ICH10_2: 591 case PCI_DEVICE_ID_INTEL_ICH10_2:
592 case PCI_DEVICE_ID_INTEL_ICH10_3: 592 case PCI_DEVICE_ID_INTEL_ICH10_3:
593 case PCI_DEVICE_ID_INTEL_CPT_LPC1:
594 case PCI_DEVICE_ID_INTEL_CPT_LPC2:
593 r->name = "PIIX/ICH"; 595 r->name = "PIIX/ICH";
594 r->get = pirq_piix_get; 596 r->get = pirq_piix_get;
595 r->set = pirq_piix_set; 597 r->set = pirq_piix_set;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index b19d1e54201e..8f3f9a50b1e0 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -303,22 +303,17 @@ static void __init pci_mmcfg_check_end_bus_number(void)
303{ 303{
304 struct pci_mmcfg_region *cfg, *cfgx; 304 struct pci_mmcfg_region *cfg, *cfgx;
305 305
306 /* last one*/ 306 /* Fixup overlaps */
307 cfg = list_entry(pci_mmcfg_list.prev, typeof(*cfg), list);
308 if (cfg)
309 if (cfg->end_bus < cfg->start_bus)
310 cfg->end_bus = 255;
311
312 if (list_is_singular(&pci_mmcfg_list))
313 return;
314
315 /* don't overlap please */
316 list_for_each_entry(cfg, &pci_mmcfg_list, list) { 307 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
317 if (cfg->end_bus < cfg->start_bus) 308 if (cfg->end_bus < cfg->start_bus)
318 cfg->end_bus = 255; 309 cfg->end_bus = 255;
319 310
311 /* Don't access the list head ! */
312 if (cfg->list.next == &pci_mmcfg_list)
313 break;
314
320 cfgx = list_entry(cfg->list.next, typeof(*cfg), list); 315 cfgx = list_entry(cfg->list.next, typeof(*cfg), list);
321 if (cfg != cfgx && cfg->end_bus >= cfgx->start_bus) 316 if (cfg->end_bus >= cfgx->start_bus)
322 cfg->end_bus = cfgx->start_bus - 1; 317 cfg->end_bus = cfgx->start_bus - 1;
323 } 318 }
324} 319}
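
The rewritten pci_mmcfg_check_end_bus_number() folds the old special cases into a single walk: an end bus below the start bus is widened to 255, and each region's end bus is clamped so it stops just short of the next region, assuming the list is kept sorted by starting bus (which is what makes the pairwise clamp sufficient). A standalone sketch over a plain array:

#include <stdio.h>

struct mmcfg { int start_bus, end_bus; };

static void fix_end_bus(struct mmcfg *cfg, int n)
{
        for (int i = 0; i < n; i++) {
                if (cfg[i].end_bus < cfg[i].start_bus)          /* implausible: widen */
                        cfg[i].end_bus = 255;
                if (i + 1 < n && cfg[i].end_bus >= cfg[i + 1].start_bus)
                        cfg[i].end_bus = cfg[i + 1].start_bus - 1;
        }
}

int main(void)
{
        struct mmcfg cfg[] = { { 0, 255 }, { 64, 0 } };

        fix_end_bus(cfg, 2);
        for (int i = 0; i < 2; i++)
                printf("region %d: buses %02x-%02x\n", i, cfg[i].start_bus, cfg[i].end_bus);
        return 0;
}
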
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 45c0c9e45903..8223738ad806 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -8,9 +8,7 @@
8#include <asm/apic.h> 8#include <asm/apic.h>
9#include <asm/mpspec.h> 9#include <asm/mpspec.h>
10#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
11 11#include <asm/numaq.h>
12#define XQUAD_PORTIO_BASE 0xfe400000
13#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
14 12
15#define BUS2QUAD(global) (mp_bus_id_to_node[global]) 13#define BUS2QUAD(global) (mp_bus_id_to_node[global])
16 14
@@ -18,8 +16,6 @@
18 16
19#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) 17#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
20 18
21#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
22
23#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ 19#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
24 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) 20 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
25 21
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
index bee8d6ac2691..13403fc95a96 100644
--- a/arch/x86/tools/test_get_len.c
+++ b/arch/x86/tools/test_get_len.c
@@ -43,7 +43,7 @@ static int x86_64;
43static void usage(void) 43static void usage(void)
44{ 44{
45 fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" 45 fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |"
46 " %s [-y|-n] [-v] \n", prog); 46 " %s [-y|-n] [-v]\n", prog);
47 fprintf(stderr, "\t-y 64bit mode\n"); 47 fprintf(stderr, "\t-y 64bit mode\n");
48 fprintf(stderr, "\t-n 32bit mode\n"); 48 fprintf(stderr, "\t-n 32bit mode\n");
49 fprintf(stderr, "\t-v verbose mode\n"); 49 fprintf(stderr, "\t-v verbose mode\n");
@@ -69,7 +69,7 @@ static void dump_field(FILE *fp, const char *name, const char *indent,
69 69
70static void dump_insn(FILE *fp, struct insn *insn) 70static void dump_insn(FILE *fp, struct insn *insn)
71{ 71{
72 fprintf(fp, "Instruction = { \n"); 72 fprintf(fp, "Instruction = {\n");
73 dump_field(fp, "prefixes", "\t", &insn->prefixes); 73 dump_field(fp, "prefixes", "\t", &insn->prefixes);
74 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); 74 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
75 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); 75 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 36daccb68642..b607239c1ba8 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -50,6 +50,7 @@
50#include <asm/traps.h> 50#include <asm/traps.h>
51#include <asm/setup.h> 51#include <asm/setup.h>
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/pgalloc.h>
53#include <asm/pgtable.h> 54#include <asm/pgtable.h>
54#include <asm/tlbflush.h> 55#include <asm/tlbflush.h>
55#include <asm/reboot.h> 56#include <asm/reboot.h>
@@ -1094,6 +1095,12 @@ asmlinkage void __init xen_start_kernel(void)
1094 1095
1095 __supported_pte_mask |= _PAGE_IOMAP; 1096 __supported_pte_mask |= _PAGE_IOMAP;
1096 1097
1098 /*
1099 * Prevent page tables from being allocated in highmem, even
1100 * if CONFIG_HIGHPTE is enabled.
1101 */
1102 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
1103
1097 /* Work out if we support NX */ 1104 /* Work out if we support NX */
1098 x86_configure_nx(); 1105 x86_configure_nx();
1099 1106
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index bf4cd6bfe959..f9eb7de74f42 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1427,23 +1427,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1427#endif 1427#endif
1428} 1428}
1429 1429
1430#ifdef CONFIG_HIGHPTE
1431static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
1432{
1433 pgprot_t prot = PAGE_KERNEL;
1434
1435 if (PagePinned(page))
1436 prot = PAGE_KERNEL_RO;
1437
1438 if (0 && PageHighMem(page))
1439 printk("mapping highpte %lx type %d prot %s\n",
1440 page_to_pfn(page), type,
1441 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
1442
1443 return kmap_atomic_prot(page, type, prot);
1444}
1445#endif
1446
1447#ifdef CONFIG_X86_32 1430#ifdef CONFIG_X86_32
1448static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 1431static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1449{ 1432{
@@ -1902,10 +1885,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1902 .alloc_pmd_clone = paravirt_nop, 1885 .alloc_pmd_clone = paravirt_nop,
1903 .release_pmd = xen_release_pmd_init, 1886 .release_pmd = xen_release_pmd_init,
1904 1887
1905#ifdef CONFIG_HIGHPTE
1906 .kmap_atomic_pte = xen_kmap_atomic_pte,
1907#endif
1908
1909#ifdef CONFIG_X86_64 1888#ifdef CONFIG_X86_64
1910 .set_pte = xen_set_pte, 1889 .set_pte = xen_set_pte,
1911#else 1890#else
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 88e15deb8b82..22a2093b5862 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -90,9 +90,9 @@ ENTRY(xen_iret)
90 GET_THREAD_INFO(%eax) 90 GET_THREAD_INFO(%eax)
91 movl TI_cpu(%eax), %eax 91 movl TI_cpu(%eax), %eax
92 movl __per_cpu_offset(,%eax,4), %eax 92 movl __per_cpu_offset(,%eax,4), %eax
93 mov per_cpu__xen_vcpu(%eax), %eax 93 mov xen_vcpu(%eax), %eax
94#else 94#else
95 movl per_cpu__xen_vcpu, %eax 95 movl xen_vcpu, %eax
96#endif 96#endif
97 97
98 /* check IF state we're restoring */ 98 /* check IF state we're restoring */