aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2008-03-18 02:44:31 -0400
committerDavid S. Miller <davem@davemloft.net>2008-03-18 02:44:31 -0400
commit2f633928cbba8a5858bb39b11e7219a41b0fbef5 (patch)
tree9a82f4b7f2c3afe4b0208d8e44ea61bae90a7d22 /arch/x86
parent5e226e4d9016daee170699f8a4188a5505021756 (diff)
parentbde4f8fa8db2abd5ac9c542d76012d0fedab050f (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig8
-rw-r--r--arch/x86/Kconfig.cpu14
-rw-r--r--arch/x86/boot/memory.c9
-rw-r--r--arch/x86/boot/vesa.h9
-rw-r--r--arch/x86/boot/video-vesa.c2
-rw-r--r--arch/x86/ia32/ia32_signal.c4
-rw-r--r--arch/x86/kernel/asm-offsets_32.c4
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/e_powersaver.c8
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c9
-rw-r--r--arch/x86/kernel/cpu/transmeta.c7
-rw-r--r--arch/x86/kernel/entry_64.S6
-rw-r--r--arch/x86/kernel/head_32.S2
-rw-r--r--arch/x86/kernel/head_64.S22
-rw-r--r--arch/x86/kernel/hpet.c4
-rw-r--r--arch/x86/kernel/i387.c10
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c8
-rw-r--r--arch/x86/kernel/ptrace.c31
-rw-r--r--arch/x86/kernel/reboot.c6
-rw-r--r--arch/x86/kernel/setup_64.c2
-rw-r--r--arch/x86/kernel/signal_32.c4
-rw-r--r--arch/x86/kernel/signal_64.c40
-rw-r--r--arch/x86/kernel/smpboot_64.c2
-rw-r--r--arch/x86/kernel/stacktrace.c4
-rw-r--r--arch/x86/kernel/step.c4
-rw-r--r--arch/x86/kernel/tls.c8
-rw-r--r--arch/x86/kernel/tsc_32.c3
-rw-r--r--arch/x86/kernel/vsyscall_64.c49
-rw-r--r--arch/x86/kvm/lapic.c4
-rw-r--r--arch/x86/kvm/mmu.c38
-rw-r--r--arch/x86/kvm/paging_tmpl.h20
-rw-r--r--arch/x86/kvm/svm.c26
-rw-r--r--arch/x86/kvm/vmx.c14
-rw-r--r--arch/x86/kvm/x86.c114
-rw-r--r--arch/x86/lguest/boot.c63
-rw-r--r--arch/x86/mm/init_64.c13
-rw-r--r--arch/x86/mm/ioremap.c4
-rw-r--r--arch/x86/mm/numa_64.c8
-rw-r--r--arch/x86/mm/pageattr.c84
-rw-r--r--arch/x86/mm/pgtable_32.c18
-rw-r--r--arch/x86/pci/pcbios.c10
-rw-r--r--arch/x86/vdso/Makefile4
-rw-r--r--arch/x86/xen/enlighten.c1
-rw-r--r--arch/x86/xen/setup.c3
46 files changed, 418 insertions, 290 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4a88cf7695b4..6c70fed0f9a0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,7 +21,8 @@ config X86
21 select HAVE_IDE 21 select HAVE_IDE
22 select HAVE_OPROFILE 22 select HAVE_OPROFILE
23 select HAVE_KPROBES 23 select HAVE_KPROBES
24 select HAVE_KVM 24 select HAVE_KRETPROBES
25 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
25 26
26 27
27config GENERIC_LOCKBREAK 28config GENERIC_LOCKBREAK
@@ -65,9 +66,6 @@ config MMU
65config ZONE_DMA 66config ZONE_DMA
66 def_bool y 67 def_bool y
67 68
68config QUICKLIST
69 def_bool X86_32
70
71config SBUS 69config SBUS
72 bool 70 bool
73 71
@@ -1261,7 +1259,7 @@ menuconfig APM
1261 machines with more than one CPU. 1259 machines with more than one CPU.
1262 1260
1263 In order to use APM, you will need supporting software. For location 1261 In order to use APM, you will need supporting software. For location
1264 and more information, read <file:Documentation/pm.txt> and the 1262 and more information, read <file:Documentation/power/pm.txt> and the
1265 Battery Powered Linux mini-HOWTO, available from 1263 Battery Powered Linux mini-HOWTO, available from
1266 <http://www.tldp.org/docs.html#howto>. 1264 <http://www.tldp.org/docs.html#howto>.
1267 1265
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e09a6b73a1aa..9304bfba7d45 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -377,6 +377,19 @@ config X86_OOSTORE
377 def_bool y 377 def_bool y
378 depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR 378 depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
379 379
380#
381# P6_NOPs are a relatively minor optimization that require a family >=
382# 6 processor, except that it is broken on certain VIA chips.
383# Furthermore, AMD chips prefer a totally different sequence of NOPs
384# (which work on all CPUs). As a result, disallow these if we're
385# compiling X86_GENERIC but not X86_64 (these NOPs do work on all
386# x86-64 capable chips); the list of processors in the right-hand clause
387# are the cores that benefit from this optimization.
388#
389config X86_P6_NOP
390 def_bool y
391 depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4)
392
380config X86_TSC 393config X86_TSC
381 def_bool y 394 def_bool y
382 depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 395 depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
@@ -390,6 +403,7 @@ config X86_CMOV
390config X86_MINIMUM_CPU_FAMILY 403config X86_MINIMUM_CPU_FAMILY
391 int 404 int
392 default "64" if X86_64 405 default "64" if X86_64
406 default "6" if X86_32 && X86_P6_NOP
393 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) 407 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK)
394 default "3" 408 default "3"
395 409
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 378353956b5d..e77d89f9e8aa 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -37,6 +37,12 @@ static int detect_memory_e820(void)
37 "=m" (*desc) 37 "=m" (*desc)
38 : "D" (desc), "d" (SMAP), "a" (0xe820)); 38 : "D" (desc), "d" (SMAP), "a" (0xe820));
39 39
40 /* BIOSes which terminate the chain with CF = 1 as opposed
41 to %ebx = 0 don't always report the SMAP signature on
42 the final, failing, probe. */
43 if (err)
44 break;
45
40 /* Some BIOSes stop returning SMAP in the middle of 46 /* Some BIOSes stop returning SMAP in the middle of
41 the search loop. We don't know exactly how the BIOS 47 the search loop. We don't know exactly how the BIOS
42 screwed up the map at that point, we might have a 48 screwed up the map at that point, we might have a
@@ -47,9 +53,6 @@ static int detect_memory_e820(void)
47 break; 53 break;
48 } 54 }
49 55
50 if (err)
51 break;
52
53 count++; 56 count++;
54 desc++; 57 desc++;
55 } while (next && count < E820MAX); 58 } while (next && count < E820MAX);
diff --git a/arch/x86/boot/vesa.h b/arch/x86/boot/vesa.h
index ff5b73cd406f..468e444622c5 100644
--- a/arch/x86/boot/vesa.h
+++ b/arch/x86/boot/vesa.h
@@ -26,17 +26,10 @@ struct vesa_general_info {
26 far_ptr video_mode_ptr; /* 14 */ 26 far_ptr video_mode_ptr; /* 14 */
27 u16 total_memory; /* 18 */ 27 u16 total_memory; /* 18 */
28 28
29 u16 oem_software_rev; /* 20 */ 29 u8 reserved[236]; /* 20 */
30 far_ptr oem_vendor_name_ptr; /* 22 */
31 far_ptr oem_product_name_ptr; /* 26 */
32 far_ptr oem_product_rev_ptr; /* 30 */
33
34 u8 reserved[222]; /* 34 */
35 u8 oem_data[256]; /* 256 */
36} __attribute__ ((packed)); 30} __attribute__ ((packed));
37 31
38#define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24)) 32#define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24))
39#define VBE2_MAGIC ('V' + ('B' << 8) + ('E' << 16) + ('2' << 24))
40 33
41struct vesa_mode_info { 34struct vesa_mode_info {
42 u16 mode_attr; /* 0 */ 35 u16 mode_attr; /* 0 */
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 662dd2f13068..419b5c273374 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -37,8 +37,6 @@ static int vesa_probe(void)
37 37
38 video_vesa.modes = GET_HEAP(struct mode_info, 0); 38 video_vesa.modes = GET_HEAP(struct mode_info, 0);
39 39
40 vginfo.signature = VBE2_MAGIC;
41
42 ax = 0x4f00; 40 ax = 0x4f00;
43 di = (size_t)&vginfo; 41 di = (size_t)&vginfo;
44 asm(INT10 42 asm(INT10
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 1c0503bdfb1a..5e7771a3ba2f 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -500,7 +500,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
500 regs->ss = __USER32_DS; 500 regs->ss = __USER32_DS;
501 501
502 set_fs(USER_DS); 502 set_fs(USER_DS);
503 regs->flags &= ~X86_EFLAGS_TF; 503 regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
504 if (test_thread_flag(TIF_SINGLESTEP)) 504 if (test_thread_flag(TIF_SINGLESTEP))
505 ptrace_notify(SIGTRAP); 505 ptrace_notify(SIGTRAP);
506 506
@@ -600,7 +600,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
600 regs->ss = __USER32_DS; 600 regs->ss = __USER32_DS;
601 601
602 set_fs(USER_DS); 602 set_fs(USER_DS);
603 regs->flags &= ~X86_EFLAGS_TF; 603 regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
604 if (test_thread_flag(TIF_SINGLESTEP)) 604 if (test_thread_flag(TIF_SINGLESTEP))
605 ptrace_notify(SIGTRAP); 605 ptrace_notify(SIGTRAP);
606 606
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index a33d53017997..8ea040124f7d 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -128,13 +128,11 @@ void foo(void)
128 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); 128 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
129#endif 129#endif
130 130
131#ifdef CONFIG_LGUEST_GUEST 131#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
132 BLANK(); 132 BLANK();
133 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 133 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
134 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); 134 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
135#endif
136 135
137#ifdef CONFIG_LGUEST
138 BLANK(); 136 BLANK();
139 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); 137 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
140 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); 138 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f86a3c4a2669..a38aafaefc23 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -504,7 +504,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
504 504
505 /* Clear all flags overriden by options */ 505 /* Clear all flags overriden by options */
506 for (i = 0; i < NCAPINTS; i++) 506 for (i = 0; i < NCAPINTS; i++)
507 c->x86_capability[i] ^= cleared_cpu_caps[i]; 507 c->x86_capability[i] &= ~cleared_cpu_caps[i];
508 508
509 /* Init Machine Check Exception if available. */ 509 /* Init Machine Check Exception if available. */
510 mcheck_init(c); 510 mcheck_init(c);
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
index 39f8cb18296c..c2f930d86640 100644
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
@@ -55,7 +55,6 @@ static int eps_set_state(struct eps_cpu_data *centaur,
55{ 55{
56 struct cpufreq_freqs freqs; 56 struct cpufreq_freqs freqs;
57 u32 lo, hi; 57 u32 lo, hi;
58 u8 current_multiplier, current_voltage;
59 int err = 0; 58 int err = 0;
60 int i; 59 int i;
61 60
@@ -95,6 +94,10 @@ postchange:
95 rdmsr(MSR_IA32_PERF_STATUS, lo, hi); 94 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
96 freqs.new = centaur->fsb * ((lo >> 8) & 0xff); 95 freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
97 96
97#ifdef DEBUG
98 {
99 u8 current_multiplier, current_voltage;
100
98 /* Print voltage and multiplier */ 101 /* Print voltage and multiplier */
99 rdmsr(MSR_IA32_PERF_STATUS, lo, hi); 102 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
100 current_voltage = lo & 0xff; 103 current_voltage = lo & 0xff;
@@ -103,7 +106,8 @@ postchange:
103 current_multiplier = (lo >> 8) & 0xff; 106 current_multiplier = (lo >> 8) & 0xff;
104 printk(KERN_INFO "eps: Current multiplier = %d\n", 107 printk(KERN_INFO "eps: Current multiplier = %d\n",
105 current_multiplier); 108 current_multiplier);
106 109 }
110#endif
107 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 111 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
108 return err; 112 return err;
109} 113}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index b6e136f23d3d..be83336fddba 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -43,6 +43,7 @@
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44#include <asm/processor.h> 44#include <asm/processor.h>
45#include <asm/msr.h> 45#include <asm/msr.h>
46#include <asm/kvm_para.h>
46#include "mtrr.h" 47#include "mtrr.h"
47 48
48u32 num_var_ranges = 0; 49u32 num_var_ranges = 0;
@@ -649,6 +650,7 @@ static __init int amd_special_default_mtrr(void)
649 650
650/** 651/**
651 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs 652 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
653 * @end_pfn: ending page frame number
652 * 654 *
653 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain 655 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
654 * memory configurations. This routine checks that the highest MTRR matches 656 * memory configurations. This routine checks that the highest MTRR matches
@@ -688,8 +690,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
688 690
689 /* kvm/qemu doesn't have mtrr set right, don't trim them all */ 691 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
690 if (!highest_pfn) { 692 if (!highest_pfn) {
691 printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); 693 if (!kvm_para_available()) {
692 WARN_ON(1); 694 printk(KERN_WARNING
695 "WARNING: strange, CPU MTRRs all blank?\n");
696 WARN_ON(1);
697 }
693 return 0; 698 return 0;
694 } 699 }
695 700
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 200fb3f9ebfb..e8b422c1c512 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -76,13 +76,6 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
76 /* All Transmeta CPUs have a constant TSC */ 76 /* All Transmeta CPUs have a constant TSC */
77 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); 77 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
78 78
79 /* If we can run i686 user-space code, call us an i686 */
80#define USER686 ((1 << X86_FEATURE_TSC)|\
81 (1 << X86_FEATURE_CX8)|\
82 (1 << X86_FEATURE_CMOV))
83 if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686)
84 c->x86 = 6;
85
86#ifdef CONFIG_SYSCTL 79#ifdef CONFIG_SYSCTL
87 /* randomize_va_space slows us down enormously; 80 /* randomize_va_space slows us down enormously;
88 it probably triggers retranslation of x86->native bytecode */ 81 it probably triggers retranslation of x86->native bytecode */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2ad9a1bc6a73..c20c9e7e08dd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -453,6 +453,7 @@ ENTRY(stub_execve)
453 CFI_REGISTER rip, r11 453 CFI_REGISTER rip, r11
454 SAVE_REST 454 SAVE_REST
455 FIXUP_TOP_OF_STACK %r11 455 FIXUP_TOP_OF_STACK %r11
456 movq %rsp, %rcx
456 call sys_execve 457 call sys_execve
457 RESTORE_TOP_OF_STACK %r11 458 RESTORE_TOP_OF_STACK %r11
458 movq %rax,RAX(%rsp) 459 movq %rax,RAX(%rsp)
@@ -1036,15 +1037,16 @@ ENDPROC(child_rip)
1036 * rdi: name, rsi: argv, rdx: envp 1037 * rdi: name, rsi: argv, rdx: envp
1037 * 1038 *
1038 * We want to fallback into: 1039 * We want to fallback into:
1039 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) 1040 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
1040 * 1041 *
1041 * do_sys_execve asm fallback arguments: 1042 * do_sys_execve asm fallback arguments:
1042 * rdi: name, rsi: argv, rdx: envp, fake frame on the stack 1043 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
1043 */ 1044 */
1044ENTRY(kernel_execve) 1045ENTRY(kernel_execve)
1045 CFI_STARTPROC 1046 CFI_STARTPROC
1046 FAKE_STACK_FRAME $0 1047 FAKE_STACK_FRAME $0
1047 SAVE_ALL 1048 SAVE_ALL
1049 movq %rsp,%rcx
1048 call sys_execve 1050 call sys_execve
1049 movq %rax, RAX(%rsp) 1051 movq %rax, RAX(%rsp)
1050 RESTORE_REST 1052 RESTORE_REST
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 25eb98540a41..fd8ca53943a8 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -606,7 +606,7 @@ ENTRY(_stext)
606.section ".bss.page_aligned","wa" 606.section ".bss.page_aligned","wa"
607 .align PAGE_SIZE_asm 607 .align PAGE_SIZE_asm
608#ifdef CONFIG_X86_PAE 608#ifdef CONFIG_X86_PAE
609ENTRY(swapper_pg_pmd) 609swapper_pg_pmd:
610 .fill 1024*KPMDS,4,0 610 .fill 1024*KPMDS,4,0
611#else 611#else
612ENTRY(swapper_pg_dir) 612ENTRY(swapper_pg_dir)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index eb415043a929..a007454133a3 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -379,18 +379,24 @@ NEXT_PAGE(level2_ident_pgt)
379 /* Since I easily can, map the first 1G. 379 /* Since I easily can, map the first 1G.
380 * Don't set NX because code runs from these pages. 380 * Don't set NX because code runs from these pages.
381 */ 381 */
382 PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) 382 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
383 383
384NEXT_PAGE(level2_kernel_pgt) 384NEXT_PAGE(level2_kernel_pgt)
385 /* 40MB kernel mapping. The kernel code cannot be bigger than that. 385 /*
386 When you change this change KERNEL_TEXT_SIZE in page.h too. */ 386 * 128 MB kernel mapping. We spend a full page on this pagetable
387 /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ 387 * anyway.
388 PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE) 388 *
389 /* Module mapping starts here */ 389 * The kernel code+data+bss must not be bigger than that.
390 .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 390 *
391 * (NOTE: at +128MB starts the module area, see MODULES_VADDR.
392 * If you want to increase this then increase MODULES_VADDR
393 * too.)
394 */
395 PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL,
396 KERNEL_IMAGE_SIZE/PMD_SIZE)
391 397
392NEXT_PAGE(level2_spare_pgt) 398NEXT_PAGE(level2_spare_pgt)
393 .fill 512,8,0 399 .fill 512, 8, 0
394 400
395#undef PMDS 401#undef PMDS
396#undef NEXT_PAGE 402#undef NEXT_PAGE
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 429d084e014d..235fd6c77504 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -368,8 +368,8 @@ static int hpet_clocksource_register(void)
368 return 0; 368 return 0;
369} 369}
370 370
371/* 371/**
372 * Try to setup the HPET timer 372 * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
373 */ 373 */
374int __init hpet_enable(void) 374int __init hpet_enable(void)
375{ 375{
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 763dfc407232..d2e39e69aaf8 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -132,7 +132,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
132 if (!cpu_has_fxsr) 132 if (!cpu_has_fxsr)
133 return -ENODEV; 133 return -ENODEV;
134 134
135 unlazy_fpu(target); 135 init_fpu(target);
136 136
137 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 137 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
138 &target->thread.i387.fxsave, 0, -1); 138 &target->thread.i387.fxsave, 0, -1);
@@ -147,7 +147,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
147 if (!cpu_has_fxsr) 147 if (!cpu_has_fxsr)
148 return -ENODEV; 148 return -ENODEV;
149 149
150 unlazy_fpu(target); 150 init_fpu(target);
151 set_stopped_child_used_math(target); 151 set_stopped_child_used_math(target);
152 152
153 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 153 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
@@ -261,7 +261,7 @@ static void convert_from_fxsr(struct user_i387_ia32_struct *env,
261 } 261 }
262#else 262#else
263 env->fip = fxsave->fip; 263 env->fip = fxsave->fip;
264 env->fcs = fxsave->fcs; 264 env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
265 env->foo = fxsave->foo; 265 env->foo = fxsave->foo;
266 env->fos = fxsave->fos; 266 env->fos = fxsave->fos;
267#endif 267#endif
@@ -307,7 +307,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
307 if (!HAVE_HWFP) 307 if (!HAVE_HWFP)
308 return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); 308 return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
309 309
310 unlazy_fpu(target); 310 init_fpu(target);
311 311
312 if (!cpu_has_fxsr) 312 if (!cpu_has_fxsr)
313 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 313 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
@@ -332,7 +332,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
332 if (!HAVE_HWFP) 332 if (!HAVE_HWFP)
333 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 333 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
334 334
335 unlazy_fpu(target); 335 init_fpu(target);
336 set_stopped_child_used_math(target); 336 set_stopped_child_used_math(target);
337 337
338 if (!cpu_has_fxsr) 338 if (!cpu_has_fxsr)
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 5b3ce7934363..3d01e47777db 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -15,6 +15,7 @@ static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm); 17struct mm_struct init_mm = INIT_MM(init_mm);
18EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
18 19
19/* 20/*
20 * Initial thread structure. 21 * Initial thread structure.
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a7d50a547dc2..be3c7a299f02 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -603,11 +603,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
603 } 603 }
604#endif 604#endif
605 605
606#ifdef X86_BTS
606 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 607 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
607 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 608 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
608 609
609 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 610 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
610 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 611 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
612#endif
611 613
612 614
613 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 615 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b0cc8f0136d8..3baf9b9f4c87 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -604,11 +604,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
604 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 604 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
605 } 605 }
606 606
607#ifdef X86_BTS
607 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 608 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
608 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 609 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
609 610
610 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 611 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
611 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 612 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
613#endif
612} 614}
613 615
614/* 616/*
@@ -730,16 +732,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
730 */ 732 */
731asmlinkage 733asmlinkage
732long sys_execve(char __user *name, char __user * __user *argv, 734long sys_execve(char __user *name, char __user * __user *argv,
733 char __user * __user *envp, struct pt_regs regs) 735 char __user * __user *envp, struct pt_regs *regs)
734{ 736{
735 long error; 737 long error;
736 char * filename; 738 char * filename;
737 739
738 filename = getname(name); 740 filename = getname(name);
739 error = PTR_ERR(filename); 741 error = PTR_ERR(filename);
740 if (IS_ERR(filename)) 742 if (IS_ERR(filename))
741 return error; 743 return error;
742 error = do_execve(filename, argv, envp, &regs); 744 error = do_execve(filename, argv, envp, regs);
743 putname(filename); 745 putname(filename);
744 return error; 746 return error;
745} 747}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index d862e396b099..d5904eef1d31 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -323,6 +323,16 @@ static int putreg(struct task_struct *child,
323 return set_flags(child, value); 323 return set_flags(child, value);
324 324
325#ifdef CONFIG_X86_64 325#ifdef CONFIG_X86_64
326 /*
327 * Orig_ax is really just a flag with small positive and
328 * negative values, so make sure to always sign-extend it
329 * from 32 bits so that it works correctly regardless of
330 * whether we come from a 32-bit environment or not.
331 */
332 case offsetof(struct user_regs_struct, orig_ax):
333 value = (long) (s32) value;
334 break;
335
326 case offsetof(struct user_regs_struct,fs_base): 336 case offsetof(struct user_regs_struct,fs_base):
327 if (value >= TASK_SIZE_OF(child)) 337 if (value >= TASK_SIZE_OF(child))
328 return -EIO; 338 return -EIO;
@@ -544,6 +554,8 @@ static int ptrace_set_debugreg(struct task_struct *child,
544 return 0; 554 return 0;
545} 555}
546 556
557#ifdef X86_BTS
558
547static int ptrace_bts_get_size(struct task_struct *child) 559static int ptrace_bts_get_size(struct task_struct *child)
548{ 560{
549 if (!child->thread.ds_area_msr) 561 if (!child->thread.ds_area_msr)
@@ -826,6 +838,7 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
826 838
827 ptrace_bts_write_record(tsk, &rec); 839 ptrace_bts_write_record(tsk, &rec);
828} 840}
841#endif /* X86_BTS */
829 842
830/* 843/*
831 * Called by kernel/ptrace.c when detaching.. 844 * Called by kernel/ptrace.c when detaching..
@@ -839,7 +852,9 @@ void ptrace_disable(struct task_struct *child)
839 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 852 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
840#endif 853#endif
841 if (child->thread.ds_area_msr) { 854 if (child->thread.ds_area_msr) {
855#ifdef X86_BTS
842 ptrace_bts_realloc(child, 0, 0); 856 ptrace_bts_realloc(child, 0, 0);
857#endif
843 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 858 child->thread.debugctlmsr &= ~ds_debugctl_mask();
844 if (!child->thread.debugctlmsr) 859 if (!child->thread.debugctlmsr)
845 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 860 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
@@ -961,6 +976,10 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
961 break; 976 break;
962#endif 977#endif
963 978
979 /*
980 * These bits need more cooking - not enabled yet:
981 */
982#ifdef X86_BTS
964 case PTRACE_BTS_CONFIG: 983 case PTRACE_BTS_CONFIG:
965 ret = ptrace_bts_config 984 ret = ptrace_bts_config
966 (child, data, (struct ptrace_bts_config __user *)addr); 985 (child, data, (struct ptrace_bts_config __user *)addr);
@@ -988,6 +1007,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
988 ret = ptrace_bts_drain 1007 ret = ptrace_bts_drain
989 (child, data, (struct bts_struct __user *) addr); 1008 (child, data, (struct bts_struct __user *) addr);
990 break; 1009 break;
1010#endif
991 1011
992 default: 1012 default:
993 ret = ptrace_request(child, request, addr, data); 1013 ret = ptrace_request(child, request, addr, data);
@@ -1035,10 +1055,17 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
1035 R32(esi, si); 1055 R32(esi, si);
1036 R32(ebp, bp); 1056 R32(ebp, bp);
1037 R32(eax, ax); 1057 R32(eax, ax);
1038 R32(orig_eax, orig_ax);
1039 R32(eip, ip); 1058 R32(eip, ip);
1040 R32(esp, sp); 1059 R32(esp, sp);
1041 1060
1061 case offsetof(struct user32, regs.orig_eax):
1062 /*
1063 * Sign-extend the value so that orig_eax = -1
1064 * causes (long)orig_ax < 0 tests to fire correctly.
1065 */
1066 regs->orig_ax = (long) (s32) value;
1067 break;
1068
1042 case offsetof(struct user32, regs.eflags): 1069 case offsetof(struct user32, regs.eflags):
1043 return set_flags(child, value); 1070 return set_flags(child, value);
1044 1071
@@ -1226,12 +1253,14 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
1226 case PTRACE_SETOPTIONS: 1253 case PTRACE_SETOPTIONS:
1227 case PTRACE_SET_THREAD_AREA: 1254 case PTRACE_SET_THREAD_AREA:
1228 case PTRACE_GET_THREAD_AREA: 1255 case PTRACE_GET_THREAD_AREA:
1256#ifdef X86_BTS
1229 case PTRACE_BTS_CONFIG: 1257 case PTRACE_BTS_CONFIG:
1230 case PTRACE_BTS_STATUS: 1258 case PTRACE_BTS_STATUS:
1231 case PTRACE_BTS_SIZE: 1259 case PTRACE_BTS_SIZE:
1232 case PTRACE_BTS_GET: 1260 case PTRACE_BTS_GET:
1233 case PTRACE_BTS_CLEAR: 1261 case PTRACE_BTS_CLEAR:
1234 case PTRACE_BTS_DRAIN: 1262 case PTRACE_BTS_DRAIN:
1263#endif
1235 return sys_ptrace(request, pid, addr, data); 1264 return sys_ptrace(request, pid, addr, data);
1236 1265
1237 default: 1266 default:
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 7fd6ac43e4a1..55ceb8cdef75 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -326,6 +326,10 @@ static inline void kb_wait(void)
326 } 326 }
327} 327}
328 328
329void __attribute__((weak)) mach_reboot_fixups(void)
330{
331}
332
329static void native_machine_emergency_restart(void) 333static void native_machine_emergency_restart(void)
330{ 334{
331 int i; 335 int i;
@@ -337,6 +341,8 @@ static void native_machine_emergency_restart(void)
337 /* Could also try the reset bit in the Hammer NB */ 341 /* Could also try the reset bit in the Hammer NB */
338 switch (reboot_type) { 342 switch (reboot_type) {
339 case BOOT_KBD: 343 case BOOT_KBD:
344 mach_reboot_fixups(); /* for board specific fixups */
345
340 for (i = 0; i < 10; i++) { 346 for (i = 0; i < 10; i++) {
341 kb_wait(); 347 kb_wait();
342 udelay(50); 348 udelay(50);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 6fd804f07821..7637dc91c79b 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -1021,7 +1021,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
1021 1021
1022 /* Clear all flags overriden by options */ 1022 /* Clear all flags overriden by options */
1023 for (i = 0; i < NCAPINTS; i++) 1023 for (i = 0; i < NCAPINTS; i++)
1024 c->x86_capability[i] ^= cleared_cpu_caps[i]; 1024 c->x86_capability[i] &= ~cleared_cpu_caps[i];
1025 1025
1026#ifdef CONFIG_X86_MCE 1026#ifdef CONFIG_X86_MCE
1027 mcheck_init(c); 1027 mcheck_init(c);
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index caee1f002fed..0157a6f0f41f 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -407,7 +407,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
407 * The tracer may want to single-step inside the 407 * The tracer may want to single-step inside the
408 * handler too. 408 * handler too.
409 */ 409 */
410 regs->flags &= ~TF_MASK; 410 regs->flags &= ~(TF_MASK | X86_EFLAGS_DF);
411 if (test_thread_flag(TIF_SINGLESTEP)) 411 if (test_thread_flag(TIF_SINGLESTEP))
412 ptrace_notify(SIGTRAP); 412 ptrace_notify(SIGTRAP);
413 413
@@ -500,7 +500,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
500 * The tracer may want to single-step inside the 500 * The tracer may want to single-step inside the
501 * handler too. 501 * handler too.
502 */ 502 */
503 regs->flags &= ~TF_MASK; 503 regs->flags &= ~(TF_MASK | X86_EFLAGS_DF);
504 if (test_thread_flag(TIF_SINGLESTEP)) 504 if (test_thread_flag(TIF_SINGLESTEP))
505 ptrace_notify(SIGTRAP); 505 ptrace_notify(SIGTRAP);
506 506
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 7347bb14e306..1c83e5124c65 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -295,7 +295,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
295 see include/asm-x86_64/uaccess.h for details. */ 295 see include/asm-x86_64/uaccess.h for details. */
296 set_fs(USER_DS); 296 set_fs(USER_DS);
297 297
298 regs->flags &= ~X86_EFLAGS_TF; 298 regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
299 if (test_thread_flag(TIF_SINGLESTEP)) 299 if (test_thread_flag(TIF_SINGLESTEP))
300 ptrace_notify(SIGTRAP); 300 ptrace_notify(SIGTRAP);
301#ifdef DEBUG_SIG 301#ifdef DEBUG_SIG
@@ -311,6 +311,35 @@ give_sigsegv:
311} 311}
312 312
313/* 313/*
314 * Return -1L or the syscall number that @regs is executing.
315 */
316static long current_syscall(struct pt_regs *regs)
317{
318 /*
319 * We always sign-extend a -1 value being set here,
320 * so this is always either -1L or a syscall number.
321 */
322 return regs->orig_ax;
323}
324
325/*
326 * Return a value that is -EFOO if the system call in @regs->orig_ax
327 * returned an error. This only works for @regs from @current.
328 */
329static long current_syscall_ret(struct pt_regs *regs)
330{
331#ifdef CONFIG_IA32_EMULATION
332 if (test_thread_flag(TIF_IA32))
333 /*
334 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
335 * and will match correctly in comparisons.
336 */
337 return (int) regs->ax;
338#endif
339 return regs->ax;
340}
341
342/*
314 * OK, we're invoking a handler 343 * OK, we're invoking a handler
315 */ 344 */
316 345
@@ -327,9 +356,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
327#endif 356#endif
328 357
329 /* Are we from a system call? */ 358 /* Are we from a system call? */
330 if ((long)regs->orig_ax >= 0) { 359 if (current_syscall(regs) >= 0) {
331 /* If so, check system call restarting.. */ 360 /* If so, check system call restarting.. */
332 switch (regs->ax) { 361 switch (current_syscall_ret(regs)) {
333 case -ERESTART_RESTARTBLOCK: 362 case -ERESTART_RESTARTBLOCK:
334 case -ERESTARTNOHAND: 363 case -ERESTARTNOHAND:
335 regs->ax = -EINTR; 364 regs->ax = -EINTR;
@@ -426,10 +455,9 @@ static void do_signal(struct pt_regs *regs)
426 } 455 }
427 456
428 /* Did we come from a system call? */ 457 /* Did we come from a system call? */
429 if ((long)regs->orig_ax >= 0) { 458 if (current_syscall(regs) >= 0) {
430 /* Restart the system call - no handlers present */ 459 /* Restart the system call - no handlers present */
431 long res = regs->ax; 460 switch (current_syscall_ret(regs)) {
432 switch (res) {
433 case -ERESTARTNOHAND: 461 case -ERESTARTNOHAND:
434 case -ERESTARTSYS: 462 case -ERESTARTSYS:
435 case -ERESTARTNOINTR: 463 case -ERESTARTNOINTR:
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
index d53bd6fcb428..0880f2c388a9 100644
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -554,10 +554,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
554 int timeout; 554 int timeout;
555 unsigned long start_rip; 555 unsigned long start_rip;
556 struct create_idle c_idle = { 556 struct create_idle c_idle = {
557 .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle),
558 .cpu = cpu, 557 .cpu = cpu,
559 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 558 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
560 }; 559 };
560 INIT_WORK(&c_idle.work, do_fork_idle);
561 561
562 /* allocate memory for gdts of secondary cpus. Hotplug is considered */ 562 /* allocate memory for gdts of secondary cpus. Hotplug is considered */
563 if (!cpu_gdt_descr[cpu].address && 563 if (!cpu_gdt_descr[cpu].address &&
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 02f0f61f5b11..c28c342c162f 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -25,6 +25,8 @@ static int save_stack_stack(void *data, char *name)
25static void save_stack_address(void *data, unsigned long addr, int reliable) 25static void save_stack_address(void *data, unsigned long addr, int reliable)
26{ 26{
27 struct stack_trace *trace = data; 27 struct stack_trace *trace = data;
28 if (!reliable)
29 return;
28 if (trace->skip > 0) { 30 if (trace->skip > 0) {
29 trace->skip--; 31 trace->skip--;
30 return; 32 return;
@@ -37,6 +39,8 @@ static void
37save_stack_address_nosched(void *data, unsigned long addr, int reliable) 39save_stack_address_nosched(void *data, unsigned long addr, int reliable)
38{ 40{
39 struct stack_trace *trace = (struct stack_trace *)data; 41 struct stack_trace *trace = (struct stack_trace *)data;
42 if (!reliable)
43 return;
40 if (in_sched_functions(addr)) 44 if (in_sched_functions(addr))
41 return; 45 return;
42 if (trace->skip > 0) { 46 if (trace->skip > 0) {
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 2ef1a5f8d675..9d406cdc847f 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block)
166 child->thread.debugctlmsr | DEBUGCTLMSR_BTF); 166 child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
167 } else { 167 } else {
168 write_debugctlmsr(child, 168 write_debugctlmsr(child,
169 child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR); 169 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
170 170
171 if (!child->thread.debugctlmsr) 171 if (!child->thread.debugctlmsr)
172 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 172 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
@@ -189,7 +189,7 @@ void user_disable_single_step(struct task_struct *child)
189 * Make sure block stepping (BTF) is disabled. 189 * Make sure block stepping (BTF) is disabled.
190 */ 190 */
191 write_debugctlmsr(child, 191 write_debugctlmsr(child,
192 child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR); 192 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
193 193
194 if (!child->thread.debugctlmsr) 194 if (!child->thread.debugctlmsr)
195 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 195 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6dfd4e76661a..022bcaa3b42e 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -91,7 +91,9 @@ int do_set_thread_area(struct task_struct *p, int idx,
91 91
92asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) 92asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
93{ 93{
94 return do_set_thread_area(current, -1, u_info, 1); 94 int ret = do_set_thread_area(current, -1, u_info, 1);
95 prevent_tail_call(ret);
96 return ret;
95} 97}
96 98
97 99
@@ -139,7 +141,9 @@ int do_get_thread_area(struct task_struct *p, int idx,
139 141
140asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) 142asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
141{ 143{
142 return do_get_thread_area(current, -1, u_info); 144 int ret = do_get_thread_area(current, -1, u_info);
145 prevent_tail_call(ret);
146 return ret;
143} 147}
144 148
145int regset_tls_active(struct task_struct *target, 149int regset_tls_active(struct task_struct *target,
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 43517e324be8..f14cfd9d1f94 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -28,7 +28,8 @@ EXPORT_SYMBOL_GPL(tsc_khz);
28static int __init tsc_setup(char *str) 28static int __init tsc_setup(char *str)
29{ 29{
30 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " 30 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
31 "cannot disable TSC.\n"); 31 "cannot disable TSC completely.\n");
32 mark_tsc_unstable("user disabled TSC");
32 return 1; 33 return 1;
33} 34}
34#else 35#else
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3f8242774580..edff4c985485 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -44,11 +44,6 @@
44 44
45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
46#define __syscall_clobber "r11","cx","memory" 46#define __syscall_clobber "r11","cx","memory"
47#define __pa_vsymbol(x) \
48 ({unsigned long v; \
49 extern char __vsyscall_0; \
50 asm("" : "=r" (v) : "0" (x)); \
51 ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
52 47
53/* 48/*
54 * vsyscall_gtod_data contains data that is : 49 * vsyscall_gtod_data contains data that is :
@@ -102,7 +97,7 @@ static __always_inline void do_get_tz(struct timezone * tz)
102static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 97static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
103{ 98{
104 int ret; 99 int ret;
105 asm volatile("vsysc2: syscall" 100 asm volatile("syscall"
106 : "=a" (ret) 101 : "=a" (ret)
107 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) 102 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
108 : __syscall_clobber ); 103 : __syscall_clobber );
@@ -112,7 +107,7 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
112static __always_inline long time_syscall(long *t) 107static __always_inline long time_syscall(long *t)
113{ 108{
114 long secs; 109 long secs;
115 asm volatile("vsysc1: syscall" 110 asm volatile("syscall"
116 : "=a" (secs) 111 : "=a" (secs)
117 : "0" (__NR_time),"D" (t) : __syscall_clobber); 112 : "0" (__NR_time),"D" (t) : __syscall_clobber);
118 return secs; 113 return secs;
@@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void)
228 223
229#ifdef CONFIG_SYSCTL 224#ifdef CONFIG_SYSCTL
230 225
231#define SYSCALL 0x050f 226static int
232#define NOP2 0x9090 227vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
233 228 void __user *buffer, size_t *lenp, loff_t *ppos)
234/*
235 * NOP out syscall in vsyscall page when not needed.
236 */
237static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
238 void __user *buffer, size_t *lenp, loff_t *ppos)
239{ 229{
240 extern u16 vsysc1, vsysc2; 230 return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
241 u16 __iomem *map1;
242 u16 __iomem *map2;
243 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
244 if (!write)
245 return ret;
246 /* gcc has some trouble with __va(__pa()), so just do it this
247 way. */
248 map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
249 if (!map1)
250 return -ENOMEM;
251 map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
252 if (!map2) {
253 ret = -ENOMEM;
254 goto out;
255 }
256 if (!vsyscall_gtod_data.sysctl_enabled) {
257 writew(SYSCALL, map1);
258 writew(SYSCALL, map2);
259 } else {
260 writew(NOP2, map1);
261 writew(NOP2, map2);
262 }
263 iounmap(map2);
264out:
265 iounmap(map1);
266 return ret;
267} 231}
268 232
269static ctl_table kernel_table2[] = { 233static ctl_table kernel_table2[] = {
@@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] = {
279 .child = kernel_table2 }, 243 .child = kernel_table2 },
280 {} 244 {}
281}; 245};
282
283#endif 246#endif
284 247
285/* Assume __initcall executes before all user space. Hopefully kmod 248/* Assume __initcall executes before all user space. Hopefully kmod
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2cbee9479ce4..68a6b1511934 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -647,6 +647,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
647 apic->timer.period = apic_get_reg(apic, APIC_TMICT) * 647 apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
648 APIC_BUS_CYCLE_NS * apic->timer.divide_count; 648 APIC_BUS_CYCLE_NS * apic->timer.divide_count;
649 atomic_set(&apic->timer.pending, 0); 649 atomic_set(&apic->timer.pending, 0);
650
651 if (!apic->timer.period)
652 return;
653
650 hrtimer_start(&apic->timer.dev, 654 hrtimer_start(&apic->timer.dev,
651 ktime_add_ns(now, apic->timer.period), 655 ktime_add_ns(now, apic->timer.period),
652 HRTIMER_MODE_ABS); 656 HRTIMER_MODE_ABS);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8efdcdbebb03..d8172aabc660 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -681,8 +681,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
681 unsigned level, 681 unsigned level,
682 int metaphysical, 682 int metaphysical,
683 unsigned access, 683 unsigned access,
684 u64 *parent_pte, 684 u64 *parent_pte)
685 bool *new_page)
686{ 685{
687 union kvm_mmu_page_role role; 686 union kvm_mmu_page_role role;
688 unsigned index; 687 unsigned index;
@@ -722,8 +721,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
722 vcpu->arch.mmu.prefetch_page(vcpu, sp); 721 vcpu->arch.mmu.prefetch_page(vcpu, sp);
723 if (!metaphysical) 722 if (!metaphysical)
724 rmap_write_protect(vcpu->kvm, gfn); 723 rmap_write_protect(vcpu->kvm, gfn);
725 if (new_page)
726 *new_page = 1;
727 return sp; 724 return sp;
728} 725}
729 726
@@ -876,11 +873,18 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
876 873
877struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) 874struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
878{ 875{
876 struct page *page;
877
879 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 878 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
880 879
881 if (gpa == UNMAPPED_GVA) 880 if (gpa == UNMAPPED_GVA)
882 return NULL; 881 return NULL;
883 return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 882
883 down_read(&current->mm->mmap_sem);
884 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
885 up_read(&current->mm->mmap_sem);
886
887 return page;
884} 888}
885 889
886static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 890static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
@@ -999,8 +1003,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
999 >> PAGE_SHIFT; 1003 >> PAGE_SHIFT;
1000 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, 1004 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1001 v, level - 1, 1005 v, level - 1,
1002 1, ACC_ALL, &table[index], 1006 1, ACC_ALL, &table[index]);
1003 NULL);
1004 if (!new_table) { 1007 if (!new_table) {
1005 pgprintk("nonpaging_map: ENOMEM\n"); 1008 pgprintk("nonpaging_map: ENOMEM\n");
1006 kvm_release_page_clean(page); 1009 kvm_release_page_clean(page);
@@ -1020,15 +1023,18 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1020 1023
1021 struct page *page; 1024 struct page *page;
1022 1025
1026 down_read(&vcpu->kvm->slots_lock);
1027
1023 down_read(&current->mm->mmap_sem); 1028 down_read(&current->mm->mmap_sem);
1024 page = gfn_to_page(vcpu->kvm, gfn); 1029 page = gfn_to_page(vcpu->kvm, gfn);
1030 up_read(&current->mm->mmap_sem);
1025 1031
1026 spin_lock(&vcpu->kvm->mmu_lock); 1032 spin_lock(&vcpu->kvm->mmu_lock);
1027 kvm_mmu_free_some_pages(vcpu); 1033 kvm_mmu_free_some_pages(vcpu);
1028 r = __nonpaging_map(vcpu, v, write, gfn, page); 1034 r = __nonpaging_map(vcpu, v, write, gfn, page);
1029 spin_unlock(&vcpu->kvm->mmu_lock); 1035 spin_unlock(&vcpu->kvm->mmu_lock);
1030 1036
1031 up_read(&current->mm->mmap_sem); 1037 up_read(&vcpu->kvm->slots_lock);
1032 1038
1033 return r; 1039 return r;
1034} 1040}
@@ -1090,7 +1096,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1090 1096
1091 ASSERT(!VALID_PAGE(root)); 1097 ASSERT(!VALID_PAGE(root));
1092 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 1098 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1093 PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL); 1099 PT64_ROOT_LEVEL, 0, ACC_ALL, NULL);
1094 root = __pa(sp->spt); 1100 root = __pa(sp->spt);
1095 ++sp->root_count; 1101 ++sp->root_count;
1096 vcpu->arch.mmu.root_hpa = root; 1102 vcpu->arch.mmu.root_hpa = root;
@@ -1111,7 +1117,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1111 root_gfn = 0; 1117 root_gfn = 0;
1112 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 1118 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1113 PT32_ROOT_LEVEL, !is_paging(vcpu), 1119 PT32_ROOT_LEVEL, !is_paging(vcpu),
1114 ACC_ALL, NULL, NULL); 1120 ACC_ALL, NULL);
1115 root = __pa(sp->spt); 1121 root = __pa(sp->spt);
1116 ++sp->root_count; 1122 ++sp->root_count;
1117 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 1123 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
@@ -1172,7 +1178,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1172 1178
1173static void paging_new_cr3(struct kvm_vcpu *vcpu) 1179static void paging_new_cr3(struct kvm_vcpu *vcpu)
1174{ 1180{
1175 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); 1181 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
1176 mmu_free_roots(vcpu); 1182 mmu_free_roots(vcpu);
1177} 1183}
1178 1184
@@ -1362,6 +1368,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1362 gfn_t gfn; 1368 gfn_t gfn;
1363 int r; 1369 int r;
1364 u64 gpte = 0; 1370 u64 gpte = 0;
1371 struct page *page;
1365 1372
1366 if (bytes != 4 && bytes != 8) 1373 if (bytes != 4 && bytes != 8)
1367 return; 1374 return;
@@ -1389,6 +1396,11 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1389 if (!is_present_pte(gpte)) 1396 if (!is_present_pte(gpte))
1390 return; 1397 return;
1391 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 1398 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1399
1400 down_read(&current->mm->mmap_sem);
1401 page = gfn_to_page(vcpu->kvm, gfn);
1402 up_read(&current->mm->mmap_sem);
1403
1392 vcpu->arch.update_pte.gfn = gfn; 1404 vcpu->arch.update_pte.gfn = gfn;
1393 vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn); 1405 vcpu->arch.update_pte.page = page;
1394} 1406}
@@ -1496,9 +1508,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1496 gpa_t gpa; 1508 gpa_t gpa;
1497 int r; 1509 int r;
1498 1510
1499 down_read(&current->mm->mmap_sem); 1511 down_read(&vcpu->kvm->slots_lock);
1500 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 1512 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1501 up_read(&current->mm->mmap_sem); 1513 up_read(&vcpu->kvm->slots_lock);
1502 1514
1503 spin_lock(&vcpu->kvm->mmu_lock); 1515 spin_lock(&vcpu->kvm->mmu_lock);
1504 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1516 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 03ba8608fe0f..ecc0856268c4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -91,7 +91,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
91 pt_element_t *table; 91 pt_element_t *table;
92 struct page *page; 92 struct page *page;
93 93
94 down_read(&current->mm->mmap_sem);
94 page = gfn_to_page(kvm, table_gfn); 95 page = gfn_to_page(kvm, table_gfn);
96 up_read(&current->mm->mmap_sem);
97
95 table = kmap_atomic(page, KM_USER0); 98 table = kmap_atomic(page, KM_USER0);
96 99
97 ret = CMPXCHG(&table[index], orig_pte, new_pte); 100 ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -140,7 +143,7 @@ walk:
140 } 143 }
141#endif 144#endif
142 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 145 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
143 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); 146 (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
144 147
145 pt_access = ACC_ALL; 148 pt_access = ACC_ALL;
146 149
@@ -297,7 +300,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
297 u64 shadow_pte; 300 u64 shadow_pte;
298 int metaphysical; 301 int metaphysical;
299 gfn_t table_gfn; 302 gfn_t table_gfn;
300 bool new_page = 0;
301 303
302 shadow_ent = ((u64 *)__va(shadow_addr)) + index; 304 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
303 if (level == PT_PAGE_TABLE_LEVEL) 305 if (level == PT_PAGE_TABLE_LEVEL)
@@ -319,8 +321,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
319 } 321 }
320 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, 322 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
321 metaphysical, access, 323 metaphysical, access,
322 shadow_ent, &new_page); 324 shadow_ent);
323 if (new_page && !metaphysical) { 325 if (!metaphysical) {
324 int r; 326 int r;
325 pt_element_t curr_pte; 327 pt_element_t curr_pte;
326 r = kvm_read_guest_atomic(vcpu->kvm, 328 r = kvm_read_guest_atomic(vcpu->kvm,
@@ -378,7 +380,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
378 if (r) 380 if (r)
379 return r; 381 return r;
380 382
381 down_read(&current->mm->mmap_sem); 383 down_read(&vcpu->kvm->slots_lock);
382 /* 384 /*
383 * Look up the shadow pte for the faulting address. 385 * Look up the shadow pte for the faulting address.
384 */ 386 */
@@ -392,11 +394,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
392 pgprintk("%s: guest page fault\n", __FUNCTION__); 394 pgprintk("%s: guest page fault\n", __FUNCTION__);
393 inject_page_fault(vcpu, addr, walker.error_code); 395 inject_page_fault(vcpu, addr, walker.error_code);
394 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 396 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
395 up_read(&current->mm->mmap_sem); 397 up_read(&vcpu->kvm->slots_lock);
396 return 0; 398 return 0;
397 } 399 }
398 400
401 down_read(&current->mm->mmap_sem);
399 page = gfn_to_page(vcpu->kvm, walker.gfn); 402 page = gfn_to_page(vcpu->kvm, walker.gfn);
403 up_read(&current->mm->mmap_sem);
400 404
401 spin_lock(&vcpu->kvm->mmu_lock); 405 spin_lock(&vcpu->kvm->mmu_lock);
402 kvm_mmu_free_some_pages(vcpu); 406 kvm_mmu_free_some_pages(vcpu);
@@ -413,14 +417,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
413 */ 417 */
414 if (shadow_pte && is_io_pte(*shadow_pte)) { 418 if (shadow_pte && is_io_pte(*shadow_pte)) {
415 spin_unlock(&vcpu->kvm->mmu_lock); 419 spin_unlock(&vcpu->kvm->mmu_lock);
416 up_read(&current->mm->mmap_sem); 420 up_read(&vcpu->kvm->slots_lock);
417 return 1; 421 return 1;
418 } 422 }
419 423
420 ++vcpu->stat.pf_fixed; 424 ++vcpu->stat.pf_fixed;
421 kvm_mmu_audit(vcpu, "post page fault (fixed)"); 425 kvm_mmu_audit(vcpu, "post page fault (fixed)");
422 spin_unlock(&vcpu->kvm->mmu_lock); 426 spin_unlock(&vcpu->kvm->mmu_lock);
423 up_read(&current->mm->mmap_sem); 427 up_read(&vcpu->kvm->slots_lock);
424 428
425 return write_pt; 429 return write_pt;
426} 430}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index de755cb1431d..1a582f1090e8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -792,6 +792,10 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
792 vcpu->arch.cr0 = cr0; 792 vcpu->arch.cr0 = cr0;
793 cr0 |= X86_CR0_PG | X86_CR0_WP; 793 cr0 |= X86_CR0_PG | X86_CR0_WP;
794 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 794 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
795 if (!vcpu->fpu_active) {
796 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
797 cr0 |= X86_CR0_TS;
798 }
795 svm->vmcb->save.cr0 = cr0; 799 svm->vmcb->save.cr0 = cr0;
796} 800}
797 801
@@ -1096,6 +1100,24 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1096 case MSR_IA32_SYSENTER_ESP: 1100 case MSR_IA32_SYSENTER_ESP:
1097 *data = svm->vmcb->save.sysenter_esp; 1101 *data = svm->vmcb->save.sysenter_esp;
1098 break; 1102 break;
1103 /* Nobody will change the following 5 values in the VMCB so
1104 we can safely return them on rdmsr. They will always be 0
1105 until LBRV is implemented. */
1106 case MSR_IA32_DEBUGCTLMSR:
1107 *data = svm->vmcb->save.dbgctl;
1108 break;
1109 case MSR_IA32_LASTBRANCHFROMIP:
1110 *data = svm->vmcb->save.br_from;
1111 break;
1112 case MSR_IA32_LASTBRANCHTOIP:
1113 *data = svm->vmcb->save.br_to;
1114 break;
1115 case MSR_IA32_LASTINTFROMIP:
1116 *data = svm->vmcb->save.last_excp_from;
1117 break;
1118 case MSR_IA32_LASTINTTOIP:
1119 *data = svm->vmcb->save.last_excp_to;
1120 break;
1099 default: 1121 default:
1100 return kvm_get_msr_common(vcpu, ecx, data); 1122 return kvm_get_msr_common(vcpu, ecx, data);
1101 } 1123 }
@@ -1156,6 +1178,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1156 case MSR_IA32_SYSENTER_ESP: 1178 case MSR_IA32_SYSENTER_ESP:
1157 svm->vmcb->save.sysenter_esp = data; 1179 svm->vmcb->save.sysenter_esp = data;
1158 break; 1180 break;
1181 case MSR_IA32_DEBUGCTLMSR:
1182 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
1183 __FUNCTION__, data);
1184 break;
1159 case MSR_K7_EVNTSEL0: 1185 case MSR_K7_EVNTSEL0:
1160 case MSR_K7_EVNTSEL1: 1186 case MSR_K7_EVNTSEL1:
1161 case MSR_K7_EVNTSEL2: 1187 case MSR_K7_EVNTSEL2:
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ad36447e696e..94ea724638fd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -638,6 +638,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
638{ 638{
639 int save_nmsrs; 639 int save_nmsrs;
640 640
641 vmx_load_host_state(vmx);
641 save_nmsrs = 0; 642 save_nmsrs = 0;
642#ifdef CONFIG_X86_64 643#ifdef CONFIG_X86_64
643 if (is_long_mode(&vmx->vcpu)) { 644 if (is_long_mode(&vmx->vcpu)) {
@@ -1477,7 +1478,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
1477 struct kvm_userspace_memory_region kvm_userspace_mem; 1478 struct kvm_userspace_memory_region kvm_userspace_mem;
1478 int r = 0; 1479 int r = 0;
1479 1480
1480 down_write(&current->mm->mmap_sem); 1481 down_write(&kvm->slots_lock);
1481 if (kvm->arch.apic_access_page) 1482 if (kvm->arch.apic_access_page)
1482 goto out; 1483 goto out;
1483 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; 1484 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -1487,9 +1488,12 @@ static int alloc_apic_access_page(struct kvm *kvm)
1487 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); 1488 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
1488 if (r) 1489 if (r)
1489 goto out; 1490 goto out;
1491
1492 down_read(&current->mm->mmap_sem);
1490 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); 1493 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1494 up_read(&current->mm->mmap_sem);
1491out: 1495out:
1492 up_write(&current->mm->mmap_sem); 1496 up_write(&kvm->slots_lock);
1493 return r; 1497 return r;
1494} 1498}
1495 1499
@@ -1602,9 +1606,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1602 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 1606 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1603 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 1607 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1604 1608
1605 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1606 if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
1607 return -ENOMEM;
1608 1609
1609 return 0; 1610 return 0;
1610} 1611}
@@ -2534,6 +2535,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2534 put_cpu(); 2535 put_cpu();
2535 if (err) 2536 if (err)
2536 goto free_vmcs; 2537 goto free_vmcs;
2538 if (vm_need_virtualize_apic_accesses(kvm))
2539 if (alloc_apic_access_page(kvm) != 0)
2540 goto free_vmcs;
2537 2541
2538 return &vmx->vcpu; 2542 return &vmx->vcpu;
2539 2543
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cf5308148689..6b01552bd1f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,9 @@
46#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 46#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
47#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 47#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
48 48
49static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
50 struct kvm_cpuid_entry2 __user *entries);
51
49struct kvm_x86_ops *kvm_x86_ops; 52struct kvm_x86_ops *kvm_x86_ops;
50 53
51struct kvm_stats_debugfs_item debugfs_entries[] = { 54struct kvm_stats_debugfs_item debugfs_entries[] = {
@@ -181,7 +184,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
181 int ret; 184 int ret;
182 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 185 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
183 186
184 down_read(&current->mm->mmap_sem); 187 down_read(&vcpu->kvm->slots_lock);
185 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 188 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
186 offset * sizeof(u64), sizeof(pdpte)); 189 offset * sizeof(u64), sizeof(pdpte));
187 if (ret < 0) { 190 if (ret < 0) {
@@ -198,7 +201,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
198 201
199 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 202 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
200out: 203out:
201 up_read(&current->mm->mmap_sem); 204 up_read(&vcpu->kvm->slots_lock);
202 205
203 return ret; 206 return ret;
204} 207}
@@ -212,13 +215,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
212 if (is_long_mode(vcpu) || !is_pae(vcpu)) 215 if (is_long_mode(vcpu) || !is_pae(vcpu))
213 return false; 216 return false;
214 217
215 down_read(&current->mm->mmap_sem); 218 down_read(&vcpu->kvm->slots_lock);
216 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 219 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
217 if (r < 0) 220 if (r < 0)
218 goto out; 221 goto out;
219 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 222 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
220out: 223out:
221 up_read(&current->mm->mmap_sem); 224 up_read(&vcpu->kvm->slots_lock);
222 225
223 return changed; 226 return changed;
224} 227}
@@ -356,7 +359,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
356 */ 359 */
357 } 360 }
358 361
359 down_read(&current->mm->mmap_sem); 362 down_read(&vcpu->kvm->slots_lock);
360 /* 363 /*
361 * Does the new cr3 value map to physical memory? (Note, we 364 * Does the new cr3 value map to physical memory? (Note, we
362 * catch an invalid cr3 even in real-mode, because it would 365 * catch an invalid cr3 even in real-mode, because it would
@@ -372,7 +375,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
372 vcpu->arch.cr3 = cr3; 375 vcpu->arch.cr3 = cr3;
373 vcpu->arch.mmu.new_cr3(vcpu); 376 vcpu->arch.mmu.new_cr3(vcpu);
374 } 377 }
375 up_read(&current->mm->mmap_sem); 378 up_read(&vcpu->kvm->slots_lock);
376} 379}
377EXPORT_SYMBOL_GPL(set_cr3); 380EXPORT_SYMBOL_GPL(set_cr3);
378 381
@@ -484,6 +487,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
484 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 487 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
485 __FUNCTION__, data); 488 __FUNCTION__, data);
486 break; 489 break;
490 case MSR_IA32_MCG_CTL:
491 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
492 __FUNCTION__, data);
493 break;
487 case MSR_IA32_UCODE_REV: 494 case MSR_IA32_UCODE_REV:
488 case MSR_IA32_UCODE_WRITE: 495 case MSR_IA32_UCODE_WRITE:
489 case 0x200 ... 0x2ff: /* MTRRs */ 496 case 0x200 ... 0x2ff: /* MTRRs */
@@ -526,6 +533,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
526 case MSR_IA32_MC0_CTL: 533 case MSR_IA32_MC0_CTL:
527 case MSR_IA32_MCG_STATUS: 534 case MSR_IA32_MCG_STATUS:
528 case MSR_IA32_MCG_CAP: 535 case MSR_IA32_MCG_CAP:
536 case MSR_IA32_MCG_CTL:
529 case MSR_IA32_MC0_MISC: 537 case MSR_IA32_MC0_MISC:
530 case MSR_IA32_MC0_MISC+4: 538 case MSR_IA32_MC0_MISC+4:
531 case MSR_IA32_MC0_MISC+8: 539 case MSR_IA32_MC0_MISC+8:
@@ -727,6 +735,24 @@ long kvm_arch_dev_ioctl(struct file *filp,
727 r = 0; 735 r = 0;
728 break; 736 break;
729 } 737 }
738 case KVM_GET_SUPPORTED_CPUID: {
739 struct kvm_cpuid2 __user *cpuid_arg = argp;
740 struct kvm_cpuid2 cpuid;
741
742 r = -EFAULT;
743 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
744 goto out;
745 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
746 cpuid_arg->entries);
747 if (r)
748 goto out;
749
750 r = -EFAULT;
751 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
752 goto out;
753 r = 0;
754 break;
755 }
730 default: 756 default:
731 r = -EINVAL; 757 r = -EINVAL;
732 } 758 }
@@ -974,8 +1000,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
974 put_cpu(); 1000 put_cpu();
975} 1001}
976 1002
977static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm, 1003static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
978 struct kvm_cpuid2 *cpuid,
979 struct kvm_cpuid_entry2 __user *entries) 1004 struct kvm_cpuid_entry2 __user *entries)
980{ 1005{
981 struct kvm_cpuid_entry2 *cpuid_entries; 1006 struct kvm_cpuid_entry2 *cpuid_entries;
@@ -1207,12 +1232,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1207 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 1232 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1208 return -EINVAL; 1233 return -EINVAL;
1209 1234
1210 down_write(&current->mm->mmap_sem); 1235 down_write(&kvm->slots_lock);
1211 1236
1212 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1237 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1213 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1238 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1214 1239
1215 up_write(&current->mm->mmap_sem); 1240 up_write(&kvm->slots_lock);
1216 return 0; 1241 return 0;
1217} 1242}
1218 1243
@@ -1261,7 +1286,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1261 < alias->target_phys_addr) 1286 < alias->target_phys_addr)
1262 goto out; 1287 goto out;
1263 1288
1264 down_write(&current->mm->mmap_sem); 1289 down_write(&kvm->slots_lock);
1265 1290
1266 p = &kvm->arch.aliases[alias->slot]; 1291 p = &kvm->arch.aliases[alias->slot];
1267 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1292 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1275,7 +1300,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1275 1300
1276 kvm_mmu_zap_all(kvm); 1301 kvm_mmu_zap_all(kvm);
1277 1302
1278 up_write(&current->mm->mmap_sem); 1303 up_write(&kvm->slots_lock);
1279 1304
1280 return 0; 1305 return 0;
1281 1306
@@ -1351,7 +1376,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1351 struct kvm_memory_slot *memslot; 1376 struct kvm_memory_slot *memslot;
1352 int is_dirty = 0; 1377 int is_dirty = 0;
1353 1378
1354 down_write(&current->mm->mmap_sem); 1379 down_write(&kvm->slots_lock);
1355 1380
1356 r = kvm_get_dirty_log(kvm, log, &is_dirty); 1381 r = kvm_get_dirty_log(kvm, log, &is_dirty);
1357 if (r) 1382 if (r)
@@ -1367,7 +1392,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1367 } 1392 }
1368 r = 0; 1393 r = 0;
1369out: 1394out:
1370 up_write(&current->mm->mmap_sem); 1395 up_write(&kvm->slots_lock);
1371 return r; 1396 return r;
1372} 1397}
1373 1398
@@ -1487,24 +1512,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
1487 r = 0; 1512 r = 0;
1488 break; 1513 break;
1489 } 1514 }
1490 case KVM_GET_SUPPORTED_CPUID: {
1491 struct kvm_cpuid2 __user *cpuid_arg = argp;
1492 struct kvm_cpuid2 cpuid;
1493
1494 r = -EFAULT;
1495 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1496 goto out;
1497 r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
1498 cpuid_arg->entries);
1499 if (r)
1500 goto out;
1501
1502 r = -EFAULT;
1503 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1504 goto out;
1505 r = 0;
1506 break;
1507 }
1508 default: 1515 default:
1509 ; 1516 ;
1510 } 1517 }
@@ -1563,7 +1570,7 @@ int emulator_read_std(unsigned long addr,
1563 void *data = val; 1570 void *data = val;
1564 int r = X86EMUL_CONTINUE; 1571 int r = X86EMUL_CONTINUE;
1565 1572
1566 down_read(&current->mm->mmap_sem); 1573 down_read(&vcpu->kvm->slots_lock);
1567 while (bytes) { 1574 while (bytes) {
1568 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1575 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1569 unsigned offset = addr & (PAGE_SIZE-1); 1576 unsigned offset = addr & (PAGE_SIZE-1);
@@ -1585,7 +1592,7 @@ int emulator_read_std(unsigned long addr,
1585 addr += tocopy; 1592 addr += tocopy;
1586 } 1593 }
1587out: 1594out:
1588 up_read(&current->mm->mmap_sem); 1595 up_read(&vcpu->kvm->slots_lock);
1589 return r; 1596 return r;
1590} 1597}
1591EXPORT_SYMBOL_GPL(emulator_read_std); 1598EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1604,9 +1611,9 @@ static int emulator_read_emulated(unsigned long addr,
1604 return X86EMUL_CONTINUE; 1611 return X86EMUL_CONTINUE;
1605 } 1612 }
1606 1613
1607 down_read(&current->mm->mmap_sem); 1614 down_read(&vcpu->kvm->slots_lock);
1608 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1615 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1609 up_read(&current->mm->mmap_sem); 1616 up_read(&vcpu->kvm->slots_lock);
1610 1617
1611 /* For APIC access vmexit */ 1618 /* For APIC access vmexit */
1612 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 1619 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1644,14 +1651,14 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1644{ 1651{
1645 int ret; 1652 int ret;
1646 1653
1647 down_read(&current->mm->mmap_sem); 1654 down_read(&vcpu->kvm->slots_lock);
1648 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 1655 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1649 if (ret < 0) { 1656 if (ret < 0) {
1650 up_read(&current->mm->mmap_sem); 1657 up_read(&vcpu->kvm->slots_lock);
1651 return 0; 1658 return 0;
1652 } 1659 }
1653 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 1660 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1654 up_read(&current->mm->mmap_sem); 1661 up_read(&vcpu->kvm->slots_lock);
1655 return 1; 1662 return 1;
1656} 1663}
1657 1664
@@ -1663,9 +1670,9 @@ static int emulator_write_emulated_onepage(unsigned long addr,
1663 struct kvm_io_device *mmio_dev; 1670 struct kvm_io_device *mmio_dev;
1664 gpa_t gpa; 1671 gpa_t gpa;
1665 1672
1666 down_read(&current->mm->mmap_sem); 1673 down_read(&vcpu->kvm->slots_lock);
1667 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1674 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1668 up_read(&current->mm->mmap_sem); 1675 up_read(&vcpu->kvm->slots_lock);
1669 1676
1670 if (gpa == UNMAPPED_GVA) { 1677 if (gpa == UNMAPPED_GVA) {
1671 kvm_inject_page_fault(vcpu, addr, 2); 1678 kvm_inject_page_fault(vcpu, addr, 2);
@@ -1742,7 +1749,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
1742 char *kaddr; 1749 char *kaddr;
1743 u64 val; 1750 u64 val;
1744 1751
1745 down_read(&current->mm->mmap_sem); 1752 down_read(&vcpu->kvm->slots_lock);
1746 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1753 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1747 1754
1748 if (gpa == UNMAPPED_GVA || 1755 if (gpa == UNMAPPED_GVA ||
@@ -1753,13 +1760,17 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
1753 goto emul_write; 1760 goto emul_write;
1754 1761
1755 val = *(u64 *)new; 1762 val = *(u64 *)new;
1763
1764 down_read(&current->mm->mmap_sem);
1756 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1765 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1766 up_read(&current->mm->mmap_sem);
1767
1757 kaddr = kmap_atomic(page, KM_USER0); 1768 kaddr = kmap_atomic(page, KM_USER0);
1758 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 1769 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
1759 kunmap_atomic(kaddr, KM_USER0); 1770 kunmap_atomic(kaddr, KM_USER0);
1760 kvm_release_page_dirty(page); 1771 kvm_release_page_dirty(page);
1761 emul_write: 1772 emul_write:
1762 up_read(&current->mm->mmap_sem); 1773 up_read(&vcpu->kvm->slots_lock);
1763 } 1774 }
1764#endif 1775#endif
1765 1776
@@ -2152,10 +2163,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2152 kvm_x86_ops->skip_emulated_instruction(vcpu); 2163 kvm_x86_ops->skip_emulated_instruction(vcpu);
2153 2164
2154 for (i = 0; i < nr_pages; ++i) { 2165 for (i = 0; i < nr_pages; ++i) {
2155 down_read(&current->mm->mmap_sem); 2166 down_read(&vcpu->kvm->slots_lock);
2156 page = gva_to_page(vcpu, address + i * PAGE_SIZE); 2167 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2157 vcpu->arch.pio.guest_pages[i] = page; 2168 vcpu->arch.pio.guest_pages[i] = page;
2158 up_read(&current->mm->mmap_sem); 2169 up_read(&vcpu->kvm->slots_lock);
2159 if (!page) { 2170 if (!page) {
2160 kvm_inject_gp(vcpu, 0); 2171 kvm_inject_gp(vcpu, 0);
2161 free_pio_guest_pages(vcpu); 2172 free_pio_guest_pages(vcpu);
@@ -2478,8 +2489,9 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
2478 2489
2479 down_read(&current->mm->mmap_sem); 2490 down_read(&current->mm->mmap_sem);
2480 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2491 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2481 vcpu->arch.apic->vapic_page = page;
2482 up_read(&current->mm->mmap_sem); 2492 up_read(&current->mm->mmap_sem);
2493
2494 vcpu->arch.apic->vapic_page = page;
2483} 2495}
2484 2496
2485static void vapic_exit(struct kvm_vcpu *vcpu) 2497static void vapic_exit(struct kvm_vcpu *vcpu)
@@ -2861,8 +2873,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2861 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2873 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2862 2874
2863 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; 2875 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
2864 vcpu->arch.cr0 = sregs->cr0;
2865 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 2876 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2877 vcpu->arch.cr0 = sregs->cr0;
2866 2878
2867 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 2879 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
2868 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 2880 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
@@ -2952,9 +2964,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2952 gpa_t gpa; 2964 gpa_t gpa;
2953 2965
2954 vcpu_load(vcpu); 2966 vcpu_load(vcpu);
2955 down_read(&current->mm->mmap_sem); 2967 down_read(&vcpu->kvm->slots_lock);
2956 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 2968 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
2957 up_read(&current->mm->mmap_sem); 2969 up_read(&vcpu->kvm->slots_lock);
2958 tr->physical_address = gpa; 2970 tr->physical_address = gpa;
2959 tr->valid = gpa != UNMAPPED_GVA; 2971 tr->valid = gpa != UNMAPPED_GVA;
2960 tr->writeable = 1; 2972 tr->writeable = 1;
@@ -3227,11 +3239,13 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3227 */ 3239 */
3228 if (!user_alloc) { 3240 if (!user_alloc) {
3229 if (npages && !old.rmap) { 3241 if (npages && !old.rmap) {
3242 down_write(&current->mm->mmap_sem);
3230 memslot->userspace_addr = do_mmap(NULL, 0, 3243 memslot->userspace_addr = do_mmap(NULL, 0,
3231 npages * PAGE_SIZE, 3244 npages * PAGE_SIZE,
3232 PROT_READ | PROT_WRITE, 3245 PROT_READ | PROT_WRITE,
3233 MAP_SHARED | MAP_ANONYMOUS, 3246 MAP_SHARED | MAP_ANONYMOUS,
3234 0); 3247 0);
3248 up_write(&current->mm->mmap_sem);
3235 3249
3236 if (IS_ERR((void *)memslot->userspace_addr)) 3250 if (IS_ERR((void *)memslot->userspace_addr))
3237 return PTR_ERR((void *)memslot->userspace_addr); 3251 return PTR_ERR((void *)memslot->userspace_addr);
@@ -3239,8 +3253,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3239 if (!old.user_alloc && old.rmap) { 3253 if (!old.user_alloc && old.rmap) {
3240 int ret; 3254 int ret;
3241 3255
3256 down_write(&current->mm->mmap_sem);
3242 ret = do_munmap(current->mm, old.userspace_addr, 3257 ret = do_munmap(current->mm, old.userspace_addr,
3243 old.npages * PAGE_SIZE); 3258 old.npages * PAGE_SIZE);
3259 up_write(&current->mm->mmap_sem);
3244 if (ret < 0) 3260 if (ret < 0)
3245 printk(KERN_WARNING 3261 printk(KERN_WARNING
3246 "kvm_vm_ioctl_set_memory_region: " 3262 "kvm_vm_ioctl_set_memory_region: "
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 5afdde4895dc..a104c532ff70 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -57,6 +57,7 @@
57#include <linux/lguest_launcher.h> 57#include <linux/lguest_launcher.h>
58#include <linux/virtio_console.h> 58#include <linux/virtio_console.h>
59#include <linux/pm.h> 59#include <linux/pm.h>
60#include <asm/lguest.h>
60#include <asm/paravirt.h> 61#include <asm/paravirt.h>
61#include <asm/param.h> 62#include <asm/param.h>
62#include <asm/page.h> 63#include <asm/page.h>
@@ -75,15 +76,6 @@
75 * behaving in simplified but equivalent ways. In particular, the Guest is the 76 * behaving in simplified but equivalent ways. In particular, the Guest is the
76 * same kernel as the Host (or at least, built from the same source code). :*/ 77 * same kernel as the Host (or at least, built from the same source code). :*/
77 78
78/* Declarations for definitions in lguest_guest.S */
79extern char lguest_noirq_start[], lguest_noirq_end[];
80extern const char lgstart_cli[], lgend_cli[];
81extern const char lgstart_sti[], lgend_sti[];
82extern const char lgstart_popf[], lgend_popf[];
83extern const char lgstart_pushf[], lgend_pushf[];
84extern const char lgstart_iret[], lgend_iret[];
85extern void lguest_iret(void);
86
87struct lguest_data lguest_data = { 79struct lguest_data lguest_data = {
88 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, 80 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
89 .noirq_start = (u32)lguest_noirq_start, 81 .noirq_start = (u32)lguest_noirq_start,
@@ -92,7 +84,6 @@ struct lguest_data lguest_data = {
92 .blocked_interrupts = { 1 }, /* Block timer interrupts */ 84 .blocked_interrupts = { 1 }, /* Block timer interrupts */
93 .syscall_vec = SYSCALL_VECTOR, 85 .syscall_vec = SYSCALL_VECTOR,
94}; 86};
95static cycle_t clock_base;
96 87
97/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
98 * ring buffer of stored hypercalls which the Host will run though next time we 89 * ring buffer of stored hypercalls which the Host will run though next time we
@@ -335,8 +326,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
335 case 1: /* Basic feature request. */ 326 case 1: /* Basic feature request. */
336 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 327 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
337 *cx &= 0x00002201; 328 *cx &= 0x00002201;
338 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ 329 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
339 *dx &= 0x07808101; 330 *dx &= 0x07808111;
340 /* The Host can do a nice optimization if it knows that the 331 /* The Host can do a nice optimization if it knows that the
341 * kernel mappings (addresses above 0xC0000000 or whatever 332 * kernel mappings (addresses above 0xC0000000 or whatever
342 * PAGE_OFFSET is set to) haven't changed. But Linux calls 333 * PAGE_OFFSET is set to) haven't changed. But Linux calls
@@ -603,19 +594,25 @@ static unsigned long lguest_get_wallclock(void)
603 return lguest_data.time.tv_sec; 594 return lguest_data.time.tv_sec;
604} 595}
605 596
597/* The TSC is a Time Stamp Counter. The Host tells us what speed it runs at,
598 * or 0 if it's unusable as a reliable clock source. This matches what we want
599 * here: if we return 0 from this function, the x86 TSC clock will not register
600 * itself. */
601static unsigned long lguest_cpu_khz(void)
602{
603 return lguest_data.tsc_khz;
604}
605
606/* If we can't use the TSC, the kernel falls back to our "lguest_clock", where
607 * we read the time value given to us by the Host. */
606static cycle_t lguest_clock_read(void) 608static cycle_t lguest_clock_read(void)
607{ 609{
608 unsigned long sec, nsec; 610 unsigned long sec, nsec;
609 611
610 /* If the Host tells the TSC speed, we can trust that. */ 612 /* Since the time is in two parts (seconds and nanoseconds), we risk
611 if (lguest_data.tsc_khz) 613 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
612 return native_read_tsc(); 614 * and getting 99 and 0. As Linux tends to come apart under the stress
613 615 * of time travel, we must be careful: */
614 /* If we can't use the TSC, we read the time value written by the Host.
615 * Since it's in two parts (seconds and nanoseconds), we risk reading
616 * it just as it's changing from 99 & 0.999999999 to 100 and 0, and
617 * getting 99 and 0. As Linux tends to come apart under the stress of
618 * time travel, we must be careful: */
619 do { 616 do {
620 /* First we read the seconds part. */ 617 /* First we read the seconds part. */
621 sec = lguest_data.time.tv_sec; 618 sec = lguest_data.time.tv_sec;
@@ -630,14 +627,14 @@ static cycle_t lguest_clock_read(void)
630 /* Now if the seconds part has changed, try again. */ 627 /* Now if the seconds part has changed, try again. */
631 } while (unlikely(lguest_data.time.tv_sec != sec)); 628 } while (unlikely(lguest_data.time.tv_sec != sec));
632 629
633 /* Our non-TSC clock is in real nanoseconds. */ 630 /* Our lguest clock is in real nanoseconds. */
634 return sec*1000000000ULL + nsec; 631 return sec*1000000000ULL + nsec;
635} 632}
636 633
637/* This is what we tell the kernel is our clocksource. */ 634/* This is the fallback clocksource: lower priority than the TSC clocksource. */
638static struct clocksource lguest_clock = { 635static struct clocksource lguest_clock = {
639 .name = "lguest", 636 .name = "lguest",
640 .rating = 400, 637 .rating = 200,
641 .read = lguest_clock_read, 638 .read = lguest_clock_read,
642 .mask = CLOCKSOURCE_MASK(64), 639 .mask = CLOCKSOURCE_MASK(64),
643 .mult = 1 << 22, 640 .mult = 1 << 22,
@@ -645,12 +642,6 @@ static struct clocksource lguest_clock = {
645 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 642 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
646}; 643};
647 644
648/* The "scheduler clock" is just our real clock, adjusted to start at zero */
649static unsigned long long lguest_sched_clock(void)
650{
651 return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base);
652}
653
654/* We also need a "struct clock_event_device": Linux asks us to set it to go 645/* We also need a "struct clock_event_device": Linux asks us to set it to go
655 * off some time in the future. Actually, James Morris figured all this out, I 646 * off some time in the future. Actually, James Morris figured all this out, I
656 * just applied the patch. */ 647 * just applied the patch. */
@@ -720,19 +711,8 @@ static void lguest_time_init(void)
720 /* Set up the timer interrupt (0) to go to our simple timer routine */ 711 /* Set up the timer interrupt (0) to go to our simple timer routine */
721 set_irq_handler(0, lguest_time_irq); 712 set_irq_handler(0, lguest_time_irq);
722 713
723 /* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can
724 * use the TSC, otherwise it's a dumb nanosecond-resolution clock.
725 * Either way, the "rating" is set so high that it's always chosen over
726 * any other clocksource. */
727 if (lguest_data.tsc_khz)
728 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
729 lguest_clock.shift);
730 clock_base = lguest_clock_read();
731 clocksource_register(&lguest_clock); 714 clocksource_register(&lguest_clock);
732 715
733 /* Now we've set up our clock, we can use it as the scheduler clock */
734 pv_time_ops.sched_clock = lguest_sched_clock;
735
736 /* We can't set cpumask in the initializer: damn C limitations! Set it 716 /* We can't set cpumask in the initializer: damn C limitations! Set it
737 * here and register our timer device. */ 717 * here and register our timer device. */
738 lguest_clockevent.cpumask = cpumask_of_cpu(0); 718 lguest_clockevent.cpumask = cpumask_of_cpu(0);
@@ -1003,6 +983,7 @@ __init void lguest_init(void)
1003 /* time operations */ 983 /* time operations */
1004 pv_time_ops.get_wallclock = lguest_get_wallclock; 984 pv_time_ops.get_wallclock = lguest_get_wallclock;
1005 pv_time_ops.time_init = lguest_time_init; 985 pv_time_ops.time_init = lguest_time_init;
986 pv_time_ops.get_cpu_khz = lguest_cpu_khz;
1006 987
1007 /* Now is a good time to look at the implementations of these functions 988 /* Now is a good time to look at the implementations of these functions
1008 * before returning to the rest of lguest_init(). */ 989 * before returning to the rest of lguest_init(). */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bb652f5a93fb..a02a14f0f324 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -172,8 +172,9 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
172} 172}
173 173
174/* 174/*
175 * The head.S code sets up the kernel high mapping from: 175 * The head.S code sets up the kernel high mapping:
176 * __START_KERNEL_map to __START_KERNEL_map + KERNEL_TEXT_SIZE 176 *
177 * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
177 * 178 *
178 * phys_addr holds the negative offset to the kernel, which is added 179 * phys_addr holds the negative offset to the kernel, which is added
179 * to the compile time generated pmds. This results in invalid pmds up 180 * to the compile time generated pmds. This results in invalid pmds up
@@ -515,14 +516,6 @@ void __init mem_init(void)
515 516
516 /* clear_bss() already clear the empty_zero_page */ 517 /* clear_bss() already clear the empty_zero_page */
517 518
518 /* temporary debugging - double check it's true: */
519 {
520 int i;
521
522 for (i = 0; i < 1024; i++)
523 WARN_ON_ONCE(empty_zero_page[i]);
524 }
525
526 reservedpages = 0; 519 reservedpages = 0;
527 520
528 /* this will put all low memory onto the freelists */ 521 /* this will put all low memory onto the freelists */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 882328efc3db..8fe576baa148 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -134,8 +134,6 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
134 return NULL; 134 return NULL;
135 } 135 }
136 136
137 WARN_ON_ONCE(page_is_ram(pfn));
138
139 switch (mode) { 137 switch (mode) {
140 case IOR_MODE_UNCACHED: 138 case IOR_MODE_UNCACHED:
141 default: 139 default:
@@ -162,7 +160,7 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
162 area->phys_addr = phys_addr; 160 area->phys_addr = phys_addr;
163 vaddr = (unsigned long) area->addr; 161 vaddr = (unsigned long) area->addr;
164 if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { 162 if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
165 remove_vm_area((void *)(vaddr & PAGE_MASK)); 163 free_vm_area(area);
166 return NULL; 164 return NULL;
167 } 165 }
168 166
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 59898fb0a4aa..8ccfee10f5b5 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -622,13 +622,17 @@ void __init init_cpu_to_node(void)
622 int i; 622 int i;
623 623
624 for (i = 0; i < NR_CPUS; i++) { 624 for (i = 0; i < NR_CPUS; i++) {
625 int node;
625 u16 apicid = x86_cpu_to_apicid_init[i]; 626 u16 apicid = x86_cpu_to_apicid_init[i];
626 627
627 if (apicid == BAD_APICID) 628 if (apicid == BAD_APICID)
628 continue; 629 continue;
629 if (apicid_to_node[apicid] == NUMA_NO_NODE) 630 node = apicid_to_node[apicid];
631 if (node == NUMA_NO_NODE)
630 continue; 632 continue;
631 numa_set_node(i, apicid_to_node[apicid]); 633 if (!node_online(node))
634 continue;
635 numa_set_node(i, node);
632 } 636 }
633} 637}
634 638
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 464d8fc21ce6..14e48b5a94ba 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -44,6 +44,12 @@ static inline unsigned long highmap_end_pfn(void)
44 44
45#endif 45#endif
46 46
47#ifdef CONFIG_DEBUG_PAGEALLOC
48# define debug_pagealloc 1
49#else
50# define debug_pagealloc 0
51#endif
52
47static inline int 53static inline int
48within(unsigned long addr, unsigned long start, unsigned long end) 54within(unsigned long addr, unsigned long start, unsigned long end)
49{ 55{
@@ -355,45 +361,48 @@ out_unlock:
355 361
356static LIST_HEAD(page_pool); 362static LIST_HEAD(page_pool);
357static unsigned long pool_size, pool_pages, pool_low; 363static unsigned long pool_size, pool_pages, pool_low;
358static unsigned long pool_used, pool_failed, pool_refill; 364static unsigned long pool_used, pool_failed;
359 365
360static void cpa_fill_pool(void) 366static void cpa_fill_pool(struct page **ret)
361{ 367{
362 struct page *p;
363 gfp_t gfp = GFP_KERNEL; 368 gfp_t gfp = GFP_KERNEL;
369 unsigned long flags;
370 struct page *p;
364 371
365 /* Do not allocate from interrupt context */
366 if (in_irq() || irqs_disabled())
367 return;
368 /* 372 /*
369 * Check unlocked. I does not matter when we have one more 373 * Avoid recursion (on debug-pagealloc) and also signal
370 * page in the pool. The bit lock avoids recursive pool 374 * our priority to get to these pagetables:
371 * allocations:
372 */ 375 */
373 if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill)) 376 if (current->flags & PF_MEMALLOC)
374 return; 377 return;
378 current->flags |= PF_MEMALLOC;
375 379
376#ifdef CONFIG_DEBUG_PAGEALLOC
377 /* 380 /*
378 * We could do: 381 * Allocate atomically from atomic contexts:
379 * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL;
380 * but this fails on !PREEMPT kernels
381 */ 382 */
382 gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; 383 if (in_atomic() || irqs_disabled() || debug_pagealloc)
383#endif 384 gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
384 385
385 while (pool_pages < pool_size) { 386 while (pool_pages < pool_size || (ret && !*ret)) {
386 p = alloc_pages(gfp, 0); 387 p = alloc_pages(gfp, 0);
387 if (!p) { 388 if (!p) {
388 pool_failed++; 389 pool_failed++;
389 break; 390 break;
390 } 391 }
391 spin_lock_irq(&pgd_lock); 392 /*
393 * If the call site needs a page right now, provide it:
394 */
395 if (ret && !*ret) {
396 *ret = p;
397 continue;
398 }
399 spin_lock_irqsave(&pgd_lock, flags);
392 list_add(&p->lru, &page_pool); 400 list_add(&p->lru, &page_pool);
393 pool_pages++; 401 pool_pages++;
394 spin_unlock_irq(&pgd_lock); 402 spin_unlock_irqrestore(&pgd_lock, flags);
395 } 403 }
396 clear_bit_unlock(0, &pool_refill); 404
405 current->flags &= ~PF_MEMALLOC;
397} 406}
398 407
399#define SHIFT_MB (20 - PAGE_SHIFT) 408#define SHIFT_MB (20 - PAGE_SHIFT)
@@ -414,11 +423,15 @@ void __init cpa_init(void)
414 * GiB. Shift MiB to Gib and multiply the result by 423 * GiB. Shift MiB to Gib and multiply the result by
415 * POOL_PAGES_PER_GB: 424 * POOL_PAGES_PER_GB:
416 */ 425 */
417 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; 426 if (debug_pagealloc) {
418 pool_size = POOL_PAGES_PER_GB * gb; 427 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
428 pool_size = POOL_PAGES_PER_GB * gb;
429 } else {
430 pool_size = 1;
431 }
419 pool_low = pool_size; 432 pool_low = pool_size;
420 433
421 cpa_fill_pool(); 434 cpa_fill_pool(NULL);
422 printk(KERN_DEBUG 435 printk(KERN_DEBUG
423 "CPA: page pool initialized %lu of %lu pages preallocated\n", 436 "CPA: page pool initialized %lu of %lu pages preallocated\n",
424 pool_pages, pool_size); 437 pool_pages, pool_size);
@@ -440,16 +453,20 @@ static int split_large_page(pte_t *kpte, unsigned long address)
440 spin_lock_irqsave(&pgd_lock, flags); 453 spin_lock_irqsave(&pgd_lock, flags);
441 if (list_empty(&page_pool)) { 454 if (list_empty(&page_pool)) {
442 spin_unlock_irqrestore(&pgd_lock, flags); 455 spin_unlock_irqrestore(&pgd_lock, flags);
443 return -ENOMEM; 456 base = NULL;
457 cpa_fill_pool(&base);
458 if (!base)
459 return -ENOMEM;
460 spin_lock_irqsave(&pgd_lock, flags);
461 } else {
462 base = list_first_entry(&page_pool, struct page, lru);
463 list_del(&base->lru);
464 pool_pages--;
465
466 if (pool_pages < pool_low)
467 pool_low = pool_pages;
444 } 468 }
445 469
446 base = list_first_entry(&page_pool, struct page, lru);
447 list_del(&base->lru);
448 pool_pages--;
449
450 if (pool_pages < pool_low)
451 pool_low = pool_pages;
452
453 /* 470 /*
454 * Check for races, another CPU might have split this page 471 * Check for races, another CPU might have split this page
455 * up for us already: 472 * up for us already:
@@ -734,7 +751,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
734 cpa_flush_all(cache); 751 cpa_flush_all(cache);
735 752
736out: 753out:
737 cpa_fill_pool(); 754 cpa_fill_pool(NULL);
755
738 return ret; 756 return ret;
739} 757}
740 758
@@ -897,7 +915,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
897 * Try to refill the page pool here. We can do this only after 915 * Try to refill the page pool here. We can do this only after
898 * the tlb flush. 916 * the tlb flush.
899 */ 917 */
900 cpa_fill_pool(); 918 cpa_fill_pool(NULL);
901} 919}
902 920
903#ifdef CONFIG_HIBERNATION 921#ifdef CONFIG_HIBERNATION
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 73aba7125203..2f9e9afcb9f4 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -342,12 +342,16 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
342 342
343pgd_t *pgd_alloc(struct mm_struct *mm) 343pgd_t *pgd_alloc(struct mm_struct *mm)
344{ 344{
345 pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); 345 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
346 346
347 mm->pgd = pgd; /* so that alloc_pd can use it */ 347 /* so that alloc_pd can use it */
348 mm->pgd = pgd;
349 if (pgd)
350 pgd_ctor(pgd);
348 351
349 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { 352 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
350 quicklist_free(0, pgd_dtor, pgd); 353 pgd_dtor(pgd);
354 free_page((unsigned long)pgd);
351 pgd = NULL; 355 pgd = NULL;
352 } 356 }
353 357
@@ -357,12 +361,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
357void pgd_free(struct mm_struct *mm, pgd_t *pgd) 361void pgd_free(struct mm_struct *mm, pgd_t *pgd)
358{ 362{
359 pgd_mop_up_pmds(mm, pgd); 363 pgd_mop_up_pmds(mm, pgd);
360 quicklist_free(0, pgd_dtor, pgd); 364 pgd_dtor(pgd);
361} 365 free_page((unsigned long)pgd);
362
363void check_pgt_cache(void)
364{
365 quicklist_trim(0, pgd_dtor, 25, 16);
366} 366}
367 367
368void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 368void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 10ac8c316c46..2f7109ac4c15 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -198,6 +198,11 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
198 "b" (bx), 198 "b" (bx),
199 "D" ((long)reg), 199 "D" ((long)reg),
200 "S" (&pci_indirect)); 200 "S" (&pci_indirect));
201 /*
202 * Zero-extend the result beyond 8 bits, do not trust the
203 * BIOS having done it:
204 */
205 *value &= 0xff;
201 break; 206 break;
202 case 2: 207 case 2:
203 __asm__("lcall *(%%esi); cld\n\t" 208 __asm__("lcall *(%%esi); cld\n\t"
@@ -210,6 +215,11 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
210 "b" (bx), 215 "b" (bx),
211 "D" ((long)reg), 216 "D" ((long)reg),
212 "S" (&pci_indirect)); 217 "S" (&pci_indirect));
218 /*
219 * Zero-extend the result beyond 16 bits, do not trust the
220 * BIOS having done it:
221 */
222 *value &= 0xffff;
213 break; 223 break;
214 case 4: 224 case 4:
215 __asm__("lcall *(%%esi); cld\n\t" 225 __asm__("lcall *(%%esi); cld\n\t"
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index f385a4b4a484..0a8f4742ef51 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -50,7 +50,9 @@ obj-$(VDSO64-y) += vdso-syms.lds
50sed-vdsosym := -e 's/^00*/0/' \ 50sed-vdsosym := -e 's/^00*/0/' \
51 -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' 51 -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p'
52quiet_cmd_vdsosym = VDSOSYM $@ 52quiet_cmd_vdsosym = VDSOSYM $@
53 cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ 53define cmd_vdsosym
54 $(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
55endef
54 56
55$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE 57$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
56 $(call if_changed,vdsosym) 58 $(call if_changed,vdsosym)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 49e5358f481a..8b9ee27805fd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -153,6 +153,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
153 if (*ax == 1) 153 if (*ax == 1)
154 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ 154 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
155 (1 << X86_FEATURE_ACPI) | /* disable ACPI */ 155 (1 << X86_FEATURE_ACPI) | /* disable ACPI */
156 (1 << X86_FEATURE_SEP) | /* disable SEP */
156 (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 157 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
157 158
158 asm(XEN_EMULATE_PREFIX "cpuid" 159 asm(XEN_EMULATE_PREFIX "cpuid"
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 3bad4773a2f3..2341492bf7a0 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -38,7 +38,8 @@ char * __init xen_memory_setup(void)
38 unsigned long max_pfn = xen_start_info->nr_pages; 38 unsigned long max_pfn = xen_start_info->nr_pages;
39 39
40 e820.nr_map = 0; 40 e820.nr_map = 0;
41 add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); 41 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
42 add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM);
42 43
43 return "Xen"; 44 return "Xen";
44} 45}