aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ingo Molnar <mingo@elte.hu> 2008-06-25 06:26:59 -0400
committer Ingo Molnar <mingo@elte.hu> 2008-06-25 06:26:59 -0400
commit f57aec5a871907427060196f6bac3d0011b38450 (patch)
tree fec11fec6cf8e2454e07bd04ac137cad85075611
parent 1de8644cc7c826e0c41e52825bd5a12e2e31e6ca (diff)
parent 543cf4cb3fe6f6cae3651ba918b9c56200b257d0 (diff)
Merge branch 'linus' into sched/devel
Conflicts: kernel/sched_rt.c
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--Documentation/DocBook/kgdb.tmpl20
-rw-r--r--Makefile2
-rw-r--r--arch/ia64/kernel/iosapic.c2
-rw-r--r--arch/ia64/kernel/setup.c3
-rw-r--r--arch/ia64/sn/kernel/sn2/sn2_smp.c2
-rw-r--r--arch/x86/Kconfig5
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/kvmclock.c89
-rw-r--r--arch/x86/kernel/pvclock.c141
-rw-r--r--arch/x86/kvm/i8254.c9
-rw-r--r--arch/x86/kvm/lapic.c1
-rw-r--r--arch/x86/kvm/mmu.c19
-rw-r--r--arch/x86/kvm/vmx.c19
-rw-r--r--arch/x86/kvm/x86.c91
-rw-r--r--arch/x86/xen/Kconfig3
-rw-r--r--arch/x86/xen/enlighten.c56
-rw-r--r--arch/x86/xen/mmu.c75
-rw-r--r--arch/x86/xen/mmu.h24
-rw-r--r--arch/x86/xen/time.c132
-rw-r--r--arch/x86/xen/xen-head.S6
-rw-r--r--drivers/char/drm/i915_drv.c1
-rw-r--r--drivers/char/tty_ioctl.c7
-rw-r--r--drivers/infiniband/hw/mthca/mthca_memfree.c6
-rw-r--r--drivers/lguest/x86/core.c15
-rw-r--r--drivers/watchdog/Makefile1
-rw-r--r--drivers/xen/events.c2
-rw-r--r--fs/gfs2/bmap.c23
-rw-r--r--fs/gfs2/rgrp.c2
-rw-r--r--fs/nfs/mount_clnt.c5
-rw-r--r--fs/nfs/super.c76
-rw-r--r--fs/nfs/write.c7
-rw-r--r--fs/select.c2
-rw-r--r--include/asm-alpha/percpu.h2
-rw-r--r--include/asm-x86/kvm_host.h4
-rw-r--r--include/asm-x86/kvm_para.h18
-rw-r--r--include/asm-x86/pvclock-abi.h42
-rw-r--r--include/asm-x86/pvclock.h13
-rw-r--r--include/asm-x86/xen/page.h4
-rw-r--r--include/linux/kvm_host.h1
-rw-r--r--include/linux/tty_driver.h5
-rw-r--r--include/xen/interface/xen.h7
-rw-r--r--kernel/futex.c93
-rw-r--r--kernel/kgdb.c3
-rw-r--r--kernel/sched.c14
-rw-r--r--mm/memory.c50
-rw-r--r--sound/isa/sb/sb_mixer.c4
-rw-r--r--sound/pci/aw2/aw2-alsa.c4
-rw-r--r--virt/kvm/ioapic.c31
48 files changed, 644 insertions, 498 deletions
diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl
index 028a8444d95e..e8acd1f03456 100644
--- a/Documentation/DocBook/kgdb.tmpl
+++ b/Documentation/DocBook/kgdb.tmpl
@@ -84,10 +84,9 @@
84 runs an instance of gdb against the vmlinux file which contains 84 runs an instance of gdb against the vmlinux file which contains
85 the symbols (not boot image such as bzImage, zImage, uImage...). 85 the symbols (not boot image such as bzImage, zImage, uImage...).
86 In gdb the developer specifies the connection parameters and 86 In gdb the developer specifies the connection parameters and
87 connects to kgdb. Depending on which kgdb I/O modules exist in 87 connects to kgdb. The type of connection a developer makes with
88 the kernel for a given architecture, it may be possible to debug 88 gdb depends on the availability of kgdb I/O modules compiled as
89 the test machine's kernel with the development machine using a 89 builtin's or kernel modules in the test machine's kernel.
90 rs232 or ethernet connection.
91 </para> 90 </para>
92 </chapter> 91 </chapter>
93 <chapter id="CompilingAKernel"> 92 <chapter id="CompilingAKernel">
@@ -223,7 +222,7 @@
223 </para> 222 </para>
224 <para> 223 <para>
225 IMPORTANT NOTE: Using this option with kgdb over the console 224 IMPORTANT NOTE: Using this option with kgdb over the console
226 (kgdboc) or kgdb over ethernet (kgdboe) is not supported. 225 (kgdboc) is not supported.
227 </para> 226 </para>
228 </sect1> 227 </sect1>
229 </chapter> 228 </chapter>
@@ -249,18 +248,11 @@
249 (gdb) target remote /dev/ttyS0 248 (gdb) target remote /dev/ttyS0
250 </programlisting> 249 </programlisting>
251 <para> 250 <para>
252 Example (kgdb to a terminal server): 251 Example (kgdb to a terminal server on tcp port 2012):
253 </para> 252 </para>
254 <programlisting> 253 <programlisting>
255 % gdb ./vmlinux 254 % gdb ./vmlinux
256 (gdb) target remote udp:192.168.2.2:6443 255 (gdb) target remote 192.168.2.2:2012
257 </programlisting>
258 <para>
259 Example (kgdb over ethernet):
260 </para>
261 <programlisting>
262 % gdb ./vmlinux
263 (gdb) target remote udp:192.168.2.2:6443
264 </programlisting> 256 </programlisting>
265 <para> 257 <para>
266 Once connected, you can debug a kernel the way you would debug an 258 Once connected, you can debug a kernel the way you would debug an
diff --git a/Makefile b/Makefile
index 2b4977c9844e..6aff5f47c21d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
1VERSION = 2 1VERSION = 2
2PATCHLEVEL = 6 2PATCHLEVEL = 6
3SUBLEVEL = 26 3SUBLEVEL = 26
4EXTRAVERSION = -rc7 4EXTRAVERSION = -rc8
5NAME = Rotary Wombat 5NAME = Rotary Wombat
6 6
7# *DOCUMENTATION* 7# *DOCUMENTATION*
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 082c31dcfd99..39752cdef6ff 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -558,8 +558,6 @@ static struct iosapic_rte_info * __init_refok iosapic_alloc_rte (void)
558 if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) { 558 if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) {
559 rte = alloc_bootmem(sizeof(struct iosapic_rte_info) * 559 rte = alloc_bootmem(sizeof(struct iosapic_rte_info) *
560 NR_PREALLOCATE_RTE_ENTRIES); 560 NR_PREALLOCATE_RTE_ENTRIES);
561 if (!rte)
562 return NULL;
563 for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++) 561 for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++)
564 list_add(&rte->rte_list, &free_rte_list); 562 list_add(&rte->rte_list, &free_rte_list);
565 } 563 }
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index f48a809c686d..4ae15c8c2488 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -578,8 +578,6 @@ setup_arch (char **cmdline_p)
578 cpu_init(); /* initialize the bootstrap CPU */ 578 cpu_init(); /* initialize the bootstrap CPU */
579 mmu_context_init(); /* initialize context_id bitmap */ 579 mmu_context_init(); /* initialize context_id bitmap */
580 580
581 check_sal_cache_flush();
582
583#ifdef CONFIG_ACPI 581#ifdef CONFIG_ACPI
584 acpi_boot_init(); 582 acpi_boot_init();
585#endif 583#endif
@@ -607,6 +605,7 @@ setup_arch (char **cmdline_p)
607 ia64_mca_init(); 605 ia64_mca_init();
608 606
609 platform_setup(cmdline_p); 607 platform_setup(cmdline_p);
608 check_sal_cache_flush();
610 paging_init(); 609 paging_init();
611} 610}
612 611
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index 6dd886c5d860..e585f9a2afb9 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -512,7 +512,7 @@ static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, si
512 int cpu; 512 int cpu;
513 char optstr[64]; 513 char optstr[64];
514 514
515 if (count > sizeof(optstr)) 515 if (count == 0 || count > sizeof(optstr))
516 return -EINVAL; 516 return -EINVAL;
517 if (copy_from_user(optstr, user, count)) 517 if (copy_from_user(optstr, user, count))
518 return -EFAULT; 518 return -EFAULT;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 52e18e6d2ba0..e0edaaa6920a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -383,6 +383,7 @@ config VMI
383config KVM_CLOCK 383config KVM_CLOCK
384 bool "KVM paravirtualized clock" 384 bool "KVM paravirtualized clock"
385 select PARAVIRT 385 select PARAVIRT
386 select PARAVIRT_CLOCK
386 depends on !(X86_VISWS || X86_VOYAGER) 387 depends on !(X86_VISWS || X86_VOYAGER)
387 help 388 help
388 Turning on this option will allow you to run a paravirtualized clock 389 Turning on this option will allow you to run a paravirtualized clock
@@ -410,6 +411,10 @@ config PARAVIRT
410 over full virtualization. However, when run without a hypervisor 411 over full virtualization. However, when run without a hypervisor
411 the kernel is theoretically slower and slightly larger. 412 the kernel is theoretically slower and slightly larger.
412 413
414config PARAVIRT_CLOCK
415 bool
416 default n
417
413endif 418endif
414 419
415config MEMTEST_BOOTPARAM 420config MEMTEST_BOOTPARAM
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..77807d4769c9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
82obj-$(CONFIG_KVM_GUEST) += kvm.o 82obj-$(CONFIG_KVM_GUEST) += kvm.o
83obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 83obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
84obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 84obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
85obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
85 86
86obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 87obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
87 88
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 08a30986d472..87edf1ceb1df 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/clocksource.h> 19#include <linux/clocksource.h>
20#include <linux/kvm_para.h> 20#include <linux/kvm_para.h>
21#include <asm/pvclock.h>
21#include <asm/arch_hooks.h> 22#include <asm/arch_hooks.h>
22#include <asm/msr.h> 23#include <asm/msr.h>
23#include <asm/apic.h> 24#include <asm/apic.h>
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)
36early_param("no-kvmclock", parse_no_kvmclock); 37early_param("no-kvmclock", parse_no_kvmclock);
37 38
38/* The hypervisor will put information about time periodically here */ 39/* The hypervisor will put information about time periodically here */
39static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); 40static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
40#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field 41static struct pvclock_wall_clock wall_clock;
41 42
42static inline u64 kvm_get_delta(u64 last_tsc)
43{
44 int cpu = smp_processor_id();
45 u64 delta = native_read_tsc() - last_tsc;
46 return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
47}
48
49static struct kvm_wall_clock wall_clock;
50static cycle_t kvm_clock_read(void);
51/* 43/*
52 * The wallclock is the time of day when we booted. Since then, some time may 44 * The wallclock is the time of day when we booted. Since then, some time may
53 * have elapsed since the hypervisor wrote the data. So we try to account for 45 * have elapsed since the hypervisor wrote the data. So we try to account for
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);
55 */ 47 */
56static unsigned long kvm_get_wallclock(void) 48static unsigned long kvm_get_wallclock(void)
57{ 49{
58 u32 wc_sec, wc_nsec; 50 struct pvclock_vcpu_time_info *vcpu_time;
59 u64 delta;
60 struct timespec ts; 51 struct timespec ts;
61 int version, nsec;
62 int low, high; 52 int low, high;
63 53
64 low = (int)__pa(&wall_clock); 54 low = (int)__pa(&wall_clock);
65 high = ((u64)__pa(&wall_clock) >> 32); 55 high = ((u64)__pa(&wall_clock) >> 32);
56 native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
66 57
67 delta = kvm_clock_read(); 58 vcpu_time = &get_cpu_var(hv_clock);
59 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
60 put_cpu_var(hv_clock);
68 61
69 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 62 return ts.tv_sec;
70 do {
71 version = wall_clock.wc_version;
72 rmb();
73 wc_sec = wall_clock.wc_sec;
74 wc_nsec = wall_clock.wc_nsec;
75 rmb();
76 } while ((wall_clock.wc_version != version) || (version & 1));
77
78 delta = kvm_clock_read() - delta;
79 delta += wc_nsec;
80 nsec = do_div(delta, NSEC_PER_SEC);
81 set_normalized_timespec(&ts, wc_sec + delta, nsec);
82 /*
83 * Of all mechanisms of time adjustment I've tested, this one
84 * was the champion!
85 */
86 return ts.tv_sec + 1;
87} 63}
88 64
89static int kvm_set_wallclock(unsigned long now) 65static int kvm_set_wallclock(unsigned long now)
90{ 66{
91 return 0; 67 return -1;
92} 68}
93 69
94/*
95 * This is our read_clock function. The host puts an tsc timestamp each time
96 * it updates a new time. Without the tsc adjustment, we can have a situation
97 * in which a vcpu starts to run earlier (smaller system_time), but probes
98 * time later (compared to another vcpu), leading to backwards time
99 */
100static cycle_t kvm_clock_read(void) 70static cycle_t kvm_clock_read(void)
101{ 71{
102 u64 last_tsc, now; 72 struct pvclock_vcpu_time_info *src;
103 int cpu; 73 cycle_t ret;
104 74
105 preempt_disable(); 75 src = &get_cpu_var(hv_clock);
106 cpu = smp_processor_id(); 76 ret = pvclock_clocksource_read(src);
107 77 put_cpu_var(hv_clock);
108 last_tsc = get_clock(cpu, tsc_timestamp); 78 return ret;
109 now = get_clock(cpu, system_time);
110
111 now += kvm_get_delta(last_tsc);
112 preempt_enable();
113
114 return now;
115} 79}
80
116static struct clocksource kvm_clock = { 81static struct clocksource kvm_clock = {
117 .name = "kvm-clock", 82 .name = "kvm-clock",
118 .read = kvm_clock_read, 83 .read = kvm_clock_read,
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
123 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 88 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
124}; 89};
125 90
126static int kvm_register_clock(void) 91static int kvm_register_clock(char *txt)
127{ 92{
128 int cpu = smp_processor_id(); 93 int cpu = smp_processor_id();
129 int low, high; 94 int low, high;
130 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 95 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
131 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 96 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
132 97 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
98 cpu, high, low, txt);
133 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); 99 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
134} 100}
135 101
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
140 * Now that the first cpu already had this clocksource initialized, 106 * Now that the first cpu already had this clocksource initialized,
141 * we shouldn't fail. 107 * we shouldn't fail.
142 */ 108 */
143 WARN_ON(kvm_register_clock()); 109 WARN_ON(kvm_register_clock("secondary cpu clock"));
144 /* ok, done with our trickery, call native */ 110 /* ok, done with our trickery, call native */
145 setup_secondary_APIC_clock(); 111 setup_secondary_APIC_clock();
146} 112}
147#endif 113#endif
148 114
115#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void)
117{
118 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu();
120}
121#endif
122
149/* 123/*
150 * After the clock is registered, the host will keep writing to the 124 * After the clock is registered, the host will keep writing to the
151 * registered memory location. If the guest happens to shutdown, this memory 125 * registered memory location. If the guest happens to shutdown, this memory
@@ -174,7 +148,7 @@ void __init kvmclock_init(void)
174 return; 148 return;
175 149
176 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 150 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
177 if (kvm_register_clock()) 151 if (kvm_register_clock("boot clock"))
178 return; 152 return;
179 pv_time_ops.get_wallclock = kvm_get_wallclock; 153 pv_time_ops.get_wallclock = kvm_get_wallclock;
180 pv_time_ops.set_wallclock = kvm_set_wallclock; 154 pv_time_ops.set_wallclock = kvm_set_wallclock;
@@ -182,6 +156,9 @@ void __init kvmclock_init(void)
182#ifdef CONFIG_X86_LOCAL_APIC 156#ifdef CONFIG_X86_LOCAL_APIC
183 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 157 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
184#endif 158#endif
159#ifdef CONFIG_SMP
160 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
161#endif
185 machine_ops.shutdown = kvm_shutdown; 162 machine_ops.shutdown = kvm_shutdown;
186#ifdef CONFIG_KEXEC 163#ifdef CONFIG_KEXEC
187 machine_ops.crash_shutdown = kvm_crash_shutdown; 164 machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 000000000000..05fbe9a0325a
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,141 @@
1/* paravirtual clock -- common code used by kvm/xen
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16*/
17
18#include <linux/kernel.h>
19#include <linux/percpu.h>
20#include <asm/pvclock.h>
21
22/*
23 * These are periodically updated
24 * xen: magic shared_info page
25 * kvm: gpa registered via msr
26 * and then copied here.
27 */
28struct pvclock_shadow_time {
29 u64 tsc_timestamp; /* TSC at last update of time vals. */
30 u64 system_timestamp; /* Time, in nanosecs, since boot. */
31 u32 tsc_to_nsec_mul;
32 int tsc_shift;
33 u32 version;
34};
35
36/*
37 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
38 * yielding a 64-bit result.
39 */
40static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
41{
42 u64 product;
43#ifdef __i386__
44 u32 tmp1, tmp2;
45#endif
46
47 if (shift < 0)
48 delta >>= -shift;
49 else
50 delta <<= shift;
51
52#ifdef __i386__
53 __asm__ (
54 "mul %5 ; "
55 "mov %4,%%eax ; "
56 "mov %%edx,%4 ; "
57 "mul %5 ; "
58 "xor %5,%5 ; "
59 "add %4,%%eax ; "
60 "adc %5,%%edx ; "
61 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
63#elif __x86_64__
64 __asm__ (
65 "mul %%rdx ; shrd $32,%%rdx,%%rax"
66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
67#else
68#error implement me!
69#endif
70
71 return product;
72}
73
74static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
75{
76 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
77 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
78}
79
80/*
81 * Reads a consistent set of time-base values from hypervisor,
82 * into a shadow data area.
83 */
84static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
85 struct pvclock_vcpu_time_info *src)
86{
87 do {
88 dst->version = src->version;
89 rmb(); /* fetch version before data */
90 dst->tsc_timestamp = src->tsc_timestamp;
91 dst->system_timestamp = src->system_time;
92 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
93 dst->tsc_shift = src->tsc_shift;
94 rmb(); /* test version after fetching data */
95 } while ((src->version & 1) || (dst->version != src->version));
96
97 return dst->version;
98}
99
100cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
101{
102 struct pvclock_shadow_time shadow;
103 unsigned version;
104 cycle_t ret, offset;
105
106 do {
107 version = pvclock_get_time_values(&shadow, src);
108 barrier();
109 offset = pvclock_get_nsec_offset(&shadow);
110 ret = shadow.system_timestamp + offset;
111 barrier();
112 } while (version != src->version);
113
114 return ret;
115}
116
117void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
118 struct pvclock_vcpu_time_info *vcpu_time,
119 struct timespec *ts)
120{
121 u32 version;
122 u64 delta;
123 struct timespec now;
124
125 /* get wallclock at system boot */
126 do {
127 version = wall_clock->version;
128 rmb(); /* fetch version before time */
129 now.tv_sec = wall_clock->sec;
130 now.tv_nsec = wall_clock->nsec;
131 rmb(); /* fetch time before checking version */
132 } while ((wall_clock->version & 1) || (version != wall_clock->version));
133
134 delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
135 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
136
137 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
138 now.tv_sec = delta;
139
140 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
141}
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index f2f5d260874e..3829aa7b663f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
200 200
201 atomic_inc(&pt->pending); 201 atomic_inc(&pt->pending);
202 smp_mb__after_atomic_inc(); 202 smp_mb__after_atomic_inc();
203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) { 203 if (vcpu0) {
204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
205 wake_up_interruptible(&vcpu0->wq); 205 if (waitqueue_active(&vcpu0->wq)) {
206 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
207 wake_up_interruptible(&vcpu0->wq);
208 }
206 } 209 }
207 210
208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 211 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c297c50eba63..ebc03f5ae162 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
940 wait_queue_head_t *q = &apic->vcpu->wq; 940 wait_queue_head_t *q = &apic->vcpu->wq;
941 941
942 atomic_inc(&apic->timer.pending); 942 atomic_inc(&apic->timer.pending);
943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
943 if (waitqueue_active(q)) { 944 if (waitqueue_active(q)) {
944 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 945 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
945 wake_up_interruptible(q); 946 wake_up_interruptible(q);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ee3f53098f0c..7e7c3969f7a2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
640 rmap_remove(kvm, spte); 640 rmap_remove(kvm, spte);
641 --kvm->stat.lpages; 641 --kvm->stat.lpages;
642 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 642 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
643 spte = NULL;
643 write_protected = 1; 644 write_protected = 1;
644 } 645 }
645 spte = rmap_next(kvm, rmapp, spte); 646 spte = rmap_next(kvm, rmapp, spte);
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1082 struct kvm_mmu_page *shadow; 1083 struct kvm_mmu_page *shadow;
1083 1084
1084 spte |= PT_WRITABLE_MASK; 1085 spte |= PT_WRITABLE_MASK;
1085 if (user_fault) {
1086 mmu_unshadow(vcpu->kvm, gfn);
1087 goto unshadowed;
1088 }
1089 1086
1090 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1087 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1091 if (shadow || 1088 if (shadow ||
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1102 } 1099 }
1103 } 1100 }
1104 1101
1105unshadowed:
1106
1107 if (pte_access & ACC_WRITE_MASK) 1102 if (pte_access & ACC_WRITE_MASK)
1108 mark_page_dirty(vcpu->kvm, gfn); 1103 mark_page_dirty(vcpu->kvm, gfn);
1109 1104
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1580 u64 *spte, 1575 u64 *spte,
1581 const void *new) 1576 const void *new)
1582{ 1577{
1583 if ((sp->role.level != PT_PAGE_TABLE_LEVEL) 1578 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1584 && !vcpu->arch.update_pte.largepage) { 1579 if (!vcpu->arch.update_pte.largepage ||
1585 ++vcpu->kvm->stat.mmu_pde_zapped; 1580 sp->role.glevels == PT32_ROOT_LEVEL) {
1586 return; 1581 ++vcpu->kvm->stat.mmu_pde_zapped;
1587 } 1582 return;
1583 }
1584 }
1588 1585
1589 ++vcpu->kvm->stat.mmu_pte_updated; 1586 ++vcpu->kvm->stat.mmu_pte_updated;
1590 if (sp->role.glevels == PT32_ROOT_LEVEL) 1587 if (sp->role.glevels == PT32_ROOT_LEVEL)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 02efbe75f317..540e95179074 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
566 load_transition_efer(vmx); 566 load_transition_efer(vmx);
567} 567}
568 568
569static void vmx_load_host_state(struct vcpu_vmx *vmx) 569static void __vmx_load_host_state(struct vcpu_vmx *vmx)
570{ 570{
571 unsigned long flags; 571 unsigned long flags;
572 572
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
596 reload_host_efer(vmx); 596 reload_host_efer(vmx);
597} 597}
598 598
599static void vmx_load_host_state(struct vcpu_vmx *vmx)
600{
601 preempt_disable();
602 __vmx_load_host_state(vmx);
603 preempt_enable();
604}
605
599/* 606/*
600 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 607 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
601 * vcpu mutex is already taken. 608 * vcpu mutex is already taken.
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
654 661
655static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 662static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
656{ 663{
657 vmx_load_host_state(to_vmx(vcpu)); 664 __vmx_load_host_state(to_vmx(vcpu));
658} 665}
659 666
660static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 667static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
884 switch (msr_index) { 891 switch (msr_index) {
885#ifdef CONFIG_X86_64 892#ifdef CONFIG_X86_64
886 case MSR_EFER: 893 case MSR_EFER:
894 vmx_load_host_state(vmx);
887 ret = kvm_set_msr_common(vcpu, msr_index, data); 895 ret = kvm_set_msr_common(vcpu, msr_index, data);
888 if (vmx->host_state.loaded) {
889 reload_host_efer(vmx);
890 load_transition_efer(vmx);
891 }
892 break; 896 break;
893 case MSR_FS_BASE: 897 case MSR_FS_BASE:
894 vmcs_writel(GUEST_FS_BASE, data); 898 vmcs_writel(GUEST_FS_BASE, data);
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
910 guest_write_tsc(data); 914 guest_write_tsc(data);
911 break; 915 break;
912 default: 916 default:
917 vmx_load_host_state(vmx);
913 msr = find_msr_entry(vmx, msr_index); 918 msr = find_msr_entry(vmx, msr_index);
914 if (msr) { 919 if (msr) {
915 msr->data = data; 920 msr->data = data;
916 if (vmx->host_state.loaded)
917 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
918 break; 921 break;
919 } 922 }
920 ret = kvm_set_msr_common(vcpu, msr_index, data); 923 ret = kvm_set_msr_common(vcpu, msr_index, data);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00acf1301a15..63a77caa59f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
492static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 492static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
493{ 493{
494 static int version; 494 static int version;
495 struct kvm_wall_clock wc; 495 struct pvclock_wall_clock wc;
496 struct timespec wc_ts; 496 struct timespec now, sys, boot;
497 497
498 if (!wall_clock) 498 if (!wall_clock)
499 return; 499 return;
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
502 502
503 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 503 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
504 504
505 wc_ts = current_kernel_time(); 505 /*
506 wc.wc_sec = wc_ts.tv_sec; 506 * The guest calculates current wall clock time by adding
507 wc.wc_nsec = wc_ts.tv_nsec; 507 * system time (updated by kvm_write_guest_time below) to the
508 wc.wc_version = version; 508 * wall clock specified here. guest system time equals host
509 * system time for us, thus we must fill in host boot time here.
510 */
511 now = current_kernel_time();
512 ktime_get_ts(&sys);
513 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
514
515 wc.sec = boot.tv_sec;
516 wc.nsec = boot.tv_nsec;
517 wc.version = version;
509 518
510 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 519 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
511 520
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
513 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 522 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
514} 523}
515 524
525static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
526{
527 uint32_t quotient, remainder;
528
529 /* Don't try to replace with do_div(), this one calculates
530 * "(dividend << 32) / divisor" */
531 __asm__ ( "divl %4"
532 : "=a" (quotient), "=d" (remainder)
533 : "0" (0), "1" (dividend), "r" (divisor) );
534 return quotient;
535}
536
537static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
538{
539 uint64_t nsecs = 1000000000LL;
540 int32_t shift = 0;
541 uint64_t tps64;
542 uint32_t tps32;
543
544 tps64 = tsc_khz * 1000LL;
545 while (tps64 > nsecs*2) {
546 tps64 >>= 1;
547 shift--;
548 }
549
550 tps32 = (uint32_t)tps64;
551 while (tps32 <= (uint32_t)nsecs) {
552 tps32 <<= 1;
553 shift++;
554 }
555
556 hv_clock->tsc_shift = shift;
557 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
558
559 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
560 __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
561 hv_clock->tsc_to_system_mul);
562}
563
516static void kvm_write_guest_time(struct kvm_vcpu *v) 564static void kvm_write_guest_time(struct kvm_vcpu *v)
517{ 565{
518 struct timespec ts; 566 struct timespec ts;
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
523 if ((!vcpu->time_page)) 571 if ((!vcpu->time_page))
524 return; 572 return;
525 573
574 if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
575 kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
576 vcpu->hv_clock_tsc_khz = tsc_khz;
577 }
578
526 /* Keep irq disabled to prevent changes to the clock */ 579 /* Keep irq disabled to prevent changes to the clock */
527 local_irq_save(flags); 580 local_irq_save(flags);
528 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 581 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
537 /* 590 /*
538 * The interface expects us to write an even number signaling that the 591 * The interface expects us to write an even number signaling that the
539 * update is finished. Since the guest won't see the intermediate 592 * update is finished. Since the guest won't see the intermediate
540 * state, we just write "2" at the end 593 * state, we just increase by 2 at the end.
541 */ 594 */
542 vcpu->hv_clock.version = 2; 595 vcpu->hv_clock.version += 2;
543 596
544 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 597 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
545 598
546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 599 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
547 sizeof(vcpu->hv_clock)); 600 sizeof(vcpu->hv_clock));
548 601
549 kunmap_atomic(shared_kaddr, KM_USER0); 602 kunmap_atomic(shared_kaddr, KM_USER0);
550 603
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
599 /* ...but clean it before doing the actual write */ 652 /* ...but clean it before doing the actual write */
600 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 653 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
601 654
602 vcpu->arch.hv_clock.tsc_to_system_mul =
603 clocksource_khz2mult(tsc_khz, 22);
604 vcpu->arch.hv_clock.tsc_shift = 22;
605
606 down_read(&current->mm->mmap_sem); 655 down_read(&current->mm->mmap_sem);
607 vcpu->arch.time_page = 656 vcpu->arch.time_page =
608 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 657 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
@@ -2759,6 +2808,8 @@ again:
2759 if (vcpu->requests) { 2808 if (vcpu->requests) {
2760 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2809 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2761 __kvm_migrate_timers(vcpu); 2810 __kvm_migrate_timers(vcpu);
2811 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2812 kvm_x86_ops->tlb_flush(vcpu);
2762 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2813 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2763 &vcpu->requests)) { 2814 &vcpu->requests)) {
2764 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 2815 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -2772,6 +2823,7 @@ again:
2772 } 2823 }
2773 } 2824 }
2774 2825
2826 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
2775 kvm_inject_pending_timer_irqs(vcpu); 2827 kvm_inject_pending_timer_irqs(vcpu);
2776 2828
2777 preempt_disable(); 2829 preempt_disable();
@@ -2781,21 +2833,13 @@ again:
2781 2833
2782 local_irq_disable(); 2834 local_irq_disable();
2783 2835
2784 if (need_resched()) { 2836 if (vcpu->requests || need_resched()) {
2785 local_irq_enable(); 2837 local_irq_enable();
2786 preempt_enable(); 2838 preempt_enable();
2787 r = 1; 2839 r = 1;
2788 goto out; 2840 goto out;
2789 } 2841 }
2790 2842
2791 if (vcpu->requests)
2792 if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
2793 local_irq_enable();
2794 preempt_enable();
2795 r = 1;
2796 goto out;
2797 }
2798
2799 if (signal_pending(current)) { 2843 if (signal_pending(current)) {
2800 local_irq_enable(); 2844 local_irq_enable();
2801 preempt_enable(); 2845 preempt_enable();
@@ -2825,9 +2869,6 @@ again:
2825 2869
2826 kvm_guest_enter(); 2870 kvm_guest_enter();
2827 2871
2828 if (vcpu->requests)
2829 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2830 kvm_x86_ops->tlb_flush(vcpu);
2831 2872
2832 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 2873 KVMTRACE_0D(VMENTRY, vcpu, entryexit);
2833 kvm_x86_ops->run(vcpu, kvm_run); 2874 kvm_x86_ops->run(vcpu, kvm_run);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 2e641be2737e..6c388e593bc8 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,8 +5,9 @@
5config XEN 5config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK
8 depends on X86_32 9 depends on X86_32
9 depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
10 help 11 help
11 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
12 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c8a56e457d61..f09c1c69c37a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
785static __init void xen_pagetable_setup_start(pgd_t *base) 785static __init void xen_pagetable_setup_start(pgd_t *base)
786{ 786{
787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; 787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
788 int i;
788 789
789 /* special set_pte for pagetable initialization */ 790 /* special set_pte for pagetable initialization */
790 pv_mmu_ops.set_pte = xen_set_pte_init; 791 pv_mmu_ops.set_pte = xen_set_pte_init;
791 792
792 init_mm.pgd = base; 793 init_mm.pgd = base;
793 /* 794 /*
794 * copy top-level of Xen-supplied pagetable into place. For 795 * copy top-level of Xen-supplied pagetable into place. This
795 * !PAE we can use this as-is, but for PAE it is a stand-in 796 * is a stand-in while we copy the pmd pages.
796 * while we copy the pmd pages.
797 */ 797 */
798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); 798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
799 799
800 if (PTRS_PER_PMD > 1) { 800 /*
801 int i; 801 * For PAE, need to allocate new pmds, rather than
802 /* 802 * share Xen's, since Xen doesn't like pmd's being
803 * For PAE, need to allocate new pmds, rather than 803 * shared between address spaces.
804 * share Xen's, since Xen doesn't like pmd's being 804 */
805 * shared between address spaces. 805 for (i = 0; i < PTRS_PER_PGD; i++) {
806 */ 806 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
807 for (i = 0; i < PTRS_PER_PGD; i++) { 807 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
808 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
809 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
810 808
811 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), 809 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
812 PAGE_SIZE); 810 PAGE_SIZE);
813 811
814 make_lowmem_page_readonly(pmd); 812 make_lowmem_page_readonly(pmd);
815 813
816 set_pgd(&base[i], __pgd(1 + __pa(pmd))); 814 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
817 } else 815 } else
818 pgd_clear(&base[i]); 816 pgd_clear(&base[i]);
819 }
820 } 817 }
821 818
822 /* make sure zero_page is mapped RO so we can use it in pagetables */ 819 /* make sure zero_page is mapped RO so we can use it in pagetables */
@@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
873 870
874 /* Actually pin the pagetable down, but we can't set PG_pinned 871 /* Actually pin the pagetable down, but we can't set PG_pinned
875 yet because the page structures don't exist yet. */ 872 yet because the page structures don't exist yet. */
876 { 873 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
877 unsigned level;
878
879#ifdef CONFIG_X86_PAE
880 level = MMUEXT_PIN_L3_TABLE;
881#else
882 level = MMUEXT_PIN_L2_TABLE;
883#endif
884
885 pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
886 }
887} 874}
888 875
889/* This is called once we have the cpu_possible_map */ 876/* This is called once we have the cpu_possible_map */
@@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1093 .make_pte = xen_make_pte, 1080 .make_pte = xen_make_pte,
1094 .make_pgd = xen_make_pgd, 1081 .make_pgd = xen_make_pgd,
1095 1082
1096#ifdef CONFIG_X86_PAE
1097 .set_pte_atomic = xen_set_pte_atomic, 1083 .set_pte_atomic = xen_set_pte_atomic,
1098 .set_pte_present = xen_set_pte_at, 1084 .set_pte_present = xen_set_pte_at,
1099 .set_pud = xen_set_pud, 1085 .set_pud = xen_set_pud,
@@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1102 1088
1103 .make_pmd = xen_make_pmd, 1089 .make_pmd = xen_make_pmd,
1104 .pmd_val = xen_pmd_val, 1090 .pmd_val = xen_pmd_val,
1105#endif /* PAE */
1106 1091
1107 .activate_mm = xen_activate_mm, 1092 .activate_mm = xen_activate_mm,
1108 .dup_mmap = xen_dup_mmap, 1093 .dup_mmap = xen_dup_mmap,
@@ -1228,6 +1213,11 @@ asmlinkage void __init xen_start_kernel(void)
1228 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1213 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1229 pv_info.kernel_rpl = 0; 1214 pv_info.kernel_rpl = 0;
1230 1215
1216 /* Prevent unwanted bits from being set in PTEs. */
1217 __supported_pte_mask &= ~_PAGE_GLOBAL;
1218 if (!is_initial_xendomain())
1219 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1220
1231 /* set the limit of our address space */ 1221 /* set the limit of our address space */
1232 xen_reserve_top(); 1222 xen_reserve_top();
1233 1223
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3525ef523a74..df40bf74ea75 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -179,50 +179,56 @@ out:
179 preempt_enable(); 179 preempt_enable();
180} 180}
181 181
182pteval_t xen_pte_val(pte_t pte) 182/* Assume pteval_t is equivalent to all the other *val_t types. */
183static pteval_t pte_mfn_to_pfn(pteval_t val)
184{
185 if (val & _PAGE_PRESENT) {
186 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
187 pteval_t flags = val & ~PTE_MASK;
188 val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
189 }
190
191 return val;
192}
193
194static pteval_t pte_pfn_to_mfn(pteval_t val)
183{ 195{
184 pteval_t ret = pte.pte; 196 if (val & _PAGE_PRESENT) {
197 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
198 pteval_t flags = val & ~PTE_MASK;
199 val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
200 }
185 201
186 if (ret & _PAGE_PRESENT) 202 return val;
187 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; 203}
188 204
189 return ret; 205pteval_t xen_pte_val(pte_t pte)
206{
207 return pte_mfn_to_pfn(pte.pte);
190} 208}
191 209
192pgdval_t xen_pgd_val(pgd_t pgd) 210pgdval_t xen_pgd_val(pgd_t pgd)
193{ 211{
194 pgdval_t ret = pgd.pgd; 212 return pte_mfn_to_pfn(pgd.pgd);
195 if (ret & _PAGE_PRESENT)
196 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
197 return ret;
198} 213}
199 214
200pte_t xen_make_pte(pteval_t pte) 215pte_t xen_make_pte(pteval_t pte)
201{ 216{
202 if (pte & _PAGE_PRESENT) { 217 pte = pte_pfn_to_mfn(pte);
203 pte = phys_to_machine(XPADDR(pte)).maddr; 218 return native_make_pte(pte);
204 pte &= ~(_PAGE_PCD | _PAGE_PWT);
205 }
206
207 return (pte_t){ .pte = pte };
208} 219}
209 220
210pgd_t xen_make_pgd(pgdval_t pgd) 221pgd_t xen_make_pgd(pgdval_t pgd)
211{ 222{
212 if (pgd & _PAGE_PRESENT) 223 pgd = pte_pfn_to_mfn(pgd);
213 pgd = phys_to_machine(XPADDR(pgd)).maddr; 224 return native_make_pgd(pgd);
214
215 return (pgd_t){ pgd };
216} 225}
217 226
218pmdval_t xen_pmd_val(pmd_t pmd) 227pmdval_t xen_pmd_val(pmd_t pmd)
219{ 228{
220 pmdval_t ret = native_pmd_val(pmd); 229 return pte_mfn_to_pfn(pmd.pmd);
221 if (ret & _PAGE_PRESENT)
222 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
223 return ret;
224} 230}
225#ifdef CONFIG_X86_PAE 231
226void xen_set_pud(pud_t *ptr, pud_t val) 232void xen_set_pud(pud_t *ptr, pud_t val)
227{ 233{
228 struct multicall_space mcs; 234 struct multicall_space mcs;
@@ -267,17 +273,9 @@ void xen_pmd_clear(pmd_t *pmdp)
267 273
268pmd_t xen_make_pmd(pmdval_t pmd) 274pmd_t xen_make_pmd(pmdval_t pmd)
269{ 275{
270 if (pmd & _PAGE_PRESENT) 276 pmd = pte_pfn_to_mfn(pmd);
271 pmd = phys_to_machine(XPADDR(pmd)).maddr;
272
273 return native_make_pmd(pmd); 277 return native_make_pmd(pmd);
274} 278}
275#else /* !PAE */
276void xen_set_pte(pte_t *ptep, pte_t pte)
277{
278 *ptep = pte;
279}
280#endif /* CONFIG_X86_PAE */
281 279
282/* 280/*
283 (Yet another) pagetable walker. This one is intended for pinning a 281 (Yet another) pagetable walker. This one is intended for pinning a
@@ -430,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level)
430 read-only, and can be pinned. */ 428 read-only, and can be pinned. */
431void xen_pgd_pin(pgd_t *pgd) 429void xen_pgd_pin(pgd_t *pgd)
432{ 430{
433 unsigned level;
434
435 xen_mc_batch(); 431 xen_mc_batch();
436 432
437 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 433 if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
@@ -441,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd)
441 xen_mc_batch(); 437 xen_mc_batch();
442 } 438 }
443 439
444#ifdef CONFIG_X86_PAE 440 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
445 level = MMUEXT_PIN_L3_TABLE;
446#else
447 level = MMUEXT_PIN_L2_TABLE;
448#endif
449
450 xen_do_pin(level, PFN_DOWN(__pa(pgd)));
451
452 xen_mc_issue(0); 441 xen_mc_issue(0);
453} 442}
454 443
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index b5e189b1519d..5fe961caffd4 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm);
37void xen_pgd_pin(pgd_t *pgd); 37void xen_pgd_pin(pgd_t *pgd);
38//void xen_pgd_unpin(pgd_t *pgd); 38//void xen_pgd_unpin(pgd_t *pgd);
39 39
40#ifdef CONFIG_X86_PAE 40pteval_t xen_pte_val(pte_t);
41unsigned long long xen_pte_val(pte_t); 41pmdval_t xen_pmd_val(pmd_t);
42unsigned long long xen_pmd_val(pmd_t); 42pgdval_t xen_pgd_val(pgd_t);
43unsigned long long xen_pgd_val(pgd_t);
44 43
45pte_t xen_make_pte(unsigned long long); 44pte_t xen_make_pte(pteval_t);
46pmd_t xen_make_pmd(unsigned long long); 45pmd_t xen_make_pmd(pmdval_t);
47pgd_t xen_make_pgd(unsigned long long); 46pgd_t xen_make_pgd(pgdval_t);
48 47
49void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 48void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
50 pte_t *ptep, pte_t pteval); 49 pte_t *ptep, pte_t pteval);
@@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val);
53void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
54void xen_pmd_clear(pmd_t *pmdp); 53void xen_pmd_clear(pmd_t *pmdp);
55 54
56
57#else
58unsigned long xen_pte_val(pte_t);
59unsigned long xen_pmd_val(pmd_t);
60unsigned long xen_pgd_val(pgd_t);
61
62pte_t xen_make_pte(unsigned long);
63pmd_t xen_make_pmd(unsigned long);
64pgd_t xen_make_pgd(unsigned long);
65#endif
66
67#endif /* _XEN_MMU_H */ 55#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 52b2e3856980..41e217503c96 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -14,6 +14,7 @@
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/math64.h> 15#include <linux/math64.h>
16 16
17#include <asm/pvclock.h>
17#include <asm/xen/hypervisor.h> 18#include <asm/xen/hypervisor.h>
18#include <asm/xen/hypercall.h> 19#include <asm/xen/hypercall.h>
19 20
@@ -31,17 +32,6 @@
31 32
32static cycle_t xen_clocksource_read(void); 33static cycle_t xen_clocksource_read(void);
33 34
34/* These are perodically updated in shared_info, and then copied here. */
35struct shadow_time_info {
36 u64 tsc_timestamp; /* TSC at last update of time vals. */
37 u64 system_timestamp; /* Time, in nanosecs, since boot. */
38 u32 tsc_to_nsec_mul;
39 int tsc_shift;
40 u32 version;
41};
42
43static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
44
45/* runstate info updated by Xen */ 35/* runstate info updated by Xen */
46static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
47 37
@@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void)
211unsigned long xen_cpu_khz(void) 201unsigned long xen_cpu_khz(void)
212{ 202{
213 u64 xen_khz = 1000000ULL << 32; 203 u64 xen_khz = 1000000ULL << 32;
214 const struct vcpu_time_info *info = 204 const struct pvclock_vcpu_time_info *info =
215 &HYPERVISOR_shared_info->vcpu_info[0].time; 205 &HYPERVISOR_shared_info->vcpu_info[0].time;
216 206
217 do_div(xen_khz, info->tsc_to_system_mul); 207 do_div(xen_khz, info->tsc_to_system_mul);
@@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void)
223 return xen_khz; 213 return xen_khz;
224} 214}
225 215
226/*
227 * Reads a consistent set of time-base values from Xen, into a shadow data
228 * area.
229 */
230static unsigned get_time_values_from_xen(void)
231{
232 struct vcpu_time_info *src;
233 struct shadow_time_info *dst;
234
235 /* src is shared memory with the hypervisor, so we need to
236 make sure we get a consistent snapshot, even in the face of
237 being preempted. */
238 src = &__get_cpu_var(xen_vcpu)->time;
239 dst = &__get_cpu_var(shadow_time);
240
241 do {
242 dst->version = src->version;
243 rmb(); /* fetch version before data */
244 dst->tsc_timestamp = src->tsc_timestamp;
245 dst->system_timestamp = src->system_time;
246 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
247 dst->tsc_shift = src->tsc_shift;
248 rmb(); /* test version after fetching data */
249 } while ((src->version & 1) | (dst->version ^ src->version));
250
251 return dst->version;
252}
253
254/*
255 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
256 * yielding a 64-bit result.
257 */
258static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
259{
260 u64 product;
261#ifdef __i386__
262 u32 tmp1, tmp2;
263#endif
264
265 if (shift < 0)
266 delta >>= -shift;
267 else
268 delta <<= shift;
269
270#ifdef __i386__
271 __asm__ (
272 "mul %5 ; "
273 "mov %4,%%eax ; "
274 "mov %%edx,%4 ; "
275 "mul %5 ; "
276 "xor %5,%5 ; "
277 "add %4,%%eax ; "
278 "adc %5,%%edx ; "
279 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
280 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
281#elif __x86_64__
282 __asm__ (
283 "mul %%rdx ; shrd $32,%%rdx,%%rax"
284 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
285#else
286#error implement me!
287#endif
288
289 return product;
290}
291
292static u64 get_nsec_offset(struct shadow_time_info *shadow)
293{
294 u64 now, delta;
295 now = native_read_tsc();
296 delta = now - shadow->tsc_timestamp;
297 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
298}
299
300static cycle_t xen_clocksource_read(void) 216static cycle_t xen_clocksource_read(void)
301{ 217{
302 struct shadow_time_info *shadow = &get_cpu_var(shadow_time); 218 struct pvclock_vcpu_time_info *src;
303 cycle_t ret; 219 cycle_t ret;
304 unsigned version;
305
306 do {
307 version = get_time_values_from_xen();
308 barrier();
309 ret = shadow->system_timestamp + get_nsec_offset(shadow);
310 barrier();
311 } while (version != __get_cpu_var(xen_vcpu)->time.version);
312
313 put_cpu_var(shadow_time);
314 220
221 src = &get_cpu_var(xen_vcpu)->time;
222 ret = pvclock_clocksource_read(src);
223 put_cpu_var(xen_vcpu);
315 return ret; 224 return ret;
316} 225}
317 226
318static void xen_read_wallclock(struct timespec *ts) 227static void xen_read_wallclock(struct timespec *ts)
319{ 228{
320 const struct shared_info *s = HYPERVISOR_shared_info; 229 struct shared_info *s = HYPERVISOR_shared_info;
321 u32 version; 230 struct pvclock_wall_clock *wall_clock = &(s->wc);
322 u64 delta; 231 struct pvclock_vcpu_time_info *vcpu_time;
323 struct timespec now;
324
325 /* get wallclock at system boot */
326 do {
327 version = s->wc_version;
328 rmb(); /* fetch version before time */
329 now.tv_sec = s->wc_sec;
330 now.tv_nsec = s->wc_nsec;
331 rmb(); /* fetch time before checking version */
332 } while ((s->wc_version & 1) | (version ^ s->wc_version));
333 232
334 delta = xen_clocksource_read(); /* time since system boot */ 233 vcpu_time = &get_cpu_var(xen_vcpu)->time;
335 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; 234 pvclock_read_wallclock(wall_clock, vcpu_time, ts);
336 235 put_cpu_var(xen_vcpu);
337 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
338 now.tv_sec = delta;
339
340 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
341} 236}
342 237
343unsigned long xen_get_wallclock(void) 238unsigned long xen_get_wallclock(void)
@@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void)
345 struct timespec ts; 240 struct timespec ts;
346 241
347 xen_read_wallclock(&ts); 242 xen_read_wallclock(&ts);
348
349 return ts.tv_sec; 243 return ts.tv_sec;
350} 244}
351 245
@@ -569,8 +463,6 @@ __init void xen_time_init(void)
569{ 463{
570 int cpu = smp_processor_id(); 464 int cpu = smp_processor_id();
571 465
572 get_time_values_from_xen();
573
574 clocksource_register(&xen_clocksource); 466 clocksource_register(&xen_clocksource);
575 467
576 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { 468 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 288d587ce73c..6ec3b4f7719b 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -17,7 +17,7 @@ ENTRY(startup_xen)
17 17
18 __FINIT 18 __FINIT
19 19
20.pushsection .bss.page_aligned 20.pushsection .text
21 .align PAGE_SIZE_asm 21 .align PAGE_SIZE_asm
22ENTRY(hypercall_page) 22ENTRY(hypercall_page)
23 .skip 0x1000 23 .skip 0x1000
@@ -30,11 +30,7 @@ ENTRY(hypercall_page)
30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
33#ifdef CONFIG_X86_PAE
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 33 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
35#else
36 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
37#endif
38 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 34 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
39 35
40#endif /*CONFIG_XEN */ 36#endif /*CONFIG_XEN */
diff --git a/drivers/char/drm/i915_drv.c b/drivers/char/drm/i915_drv.c
index e8f3d682e3b1..93aed1c38bd2 100644
--- a/drivers/char/drm/i915_drv.c
+++ b/drivers/char/drm/i915_drv.c
@@ -389,6 +389,7 @@ static int i915_resume(struct drm_device *dev)
389 pci_restore_state(dev->pdev); 389 pci_restore_state(dev->pdev);
390 if (pci_enable_device(dev->pdev)) 390 if (pci_enable_device(dev->pdev))
391 return -1; 391 return -1;
392 pci_set_master(dev->pdev);
392 393
393 pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB); 394 pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB);
394 395
diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c
index b1a757a5ee27..8f81139d6194 100644
--- a/drivers/char/tty_ioctl.c
+++ b/drivers/char/tty_ioctl.c
@@ -981,16 +981,9 @@ EXPORT_SYMBOL_GPL(tty_perform_flush);
981int n_tty_ioctl(struct tty_struct *tty, struct file *file, 981int n_tty_ioctl(struct tty_struct *tty, struct file *file,
982 unsigned int cmd, unsigned long arg) 982 unsigned int cmd, unsigned long arg)
983{ 983{
984 struct tty_struct *real_tty;
985 unsigned long flags; 984 unsigned long flags;
986 int retval; 985 int retval;
987 986
988 if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
989 tty->driver->subtype == PTY_TYPE_MASTER)
990 real_tty = tty->link;
991 else
992 real_tty = tty;
993
994 switch (cmd) { 987 switch (cmd) {
995 case TCXONC: 988 case TCXONC:
996 retval = tty_check_change(tty); 989 retval = tty_check_change(tty);
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index b224079d4e1f..d5862e5d99a0 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -109,7 +109,11 @@ static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_m
109{ 109{
110 struct page *page; 110 struct page *page;
111 111
112 page = alloc_pages(gfp_mask, order); 112 /*
113 * Use __GFP_ZERO because buggy firmware assumes ICM pages are
114 * cleared, and subtle failures are seen if they aren't.
115 */
116 page = alloc_pages(gfp_mask | __GFP_ZERO, order);
113 if (!page) 117 if (!page)
114 return -ENOMEM; 118 return -ENOMEM;
115 119
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 5126d5d9ea0e..2e554a4ab337 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -176,7 +176,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
176 * we set it now, so we can trap and pass that trap to the Guest if it 176 * we set it now, so we can trap and pass that trap to the Guest if it
177 * uses the FPU. */ 177 * uses the FPU. */
178 if (cpu->ts) 178 if (cpu->ts)
179 lguest_set_ts(); 179 unlazy_fpu(current);
180 180
181 /* SYSENTER is an optimized way of doing system calls. We can't allow 181 /* SYSENTER is an optimized way of doing system calls. We can't allow
182 * it because it always jumps to privilege level 0. A normal Guest 182 * it because it always jumps to privilege level 0. A normal Guest
@@ -196,6 +196,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
196 * trap made the switcher code come back, and an error code which some 196 * trap made the switcher code come back, and an error code which some
197 * traps set. */ 197 * traps set. */
198 198
199 /* Restore SYSENTER if it's supposed to be on. */
200 if (boot_cpu_has(X86_FEATURE_SEP))
201 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
202
199 /* If the Guest page faulted, then the cr2 register will tell us the 203 /* If the Guest page faulted, then the cr2 register will tell us the
200 * bad virtual address. We have to grab this now, because once we 204 * bad virtual address. We have to grab this now, because once we
201 * re-enable interrupts an interrupt could fault and thus overwrite 205 * re-enable interrupts an interrupt could fault and thus overwrite
@@ -203,13 +207,12 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
203 if (cpu->regs->trapnum == 14) 207 if (cpu->regs->trapnum == 14)
204 cpu->arch.last_pagefault = read_cr2(); 208 cpu->arch.last_pagefault = read_cr2();
205 /* Similarly, if we took a trap because the Guest used the FPU, 209 /* Similarly, if we took a trap because the Guest used the FPU,
206 * we have to restore the FPU it expects to see. */ 210 * we have to restore the FPU it expects to see.
211 * math_state_restore() may sleep and we may even move off to
212 * a different CPU. So all the critical stuff should be done
213 * before this. */
207 else if (cpu->regs->trapnum == 7) 214 else if (cpu->regs->trapnum == 7)
208 math_state_restore(); 215 math_state_restore();
209
210 /* Restore SYSENTER if it's supposed to be on. */
211 if (boot_cpu_has(X86_FEATURE_SEP))
212 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
213} 216}
214 217
215/*H:130 Now we've examined the hypercall code; our Guest can make requests. 218/*H:130 Now we've examined the hypercall code; our Guest can make requests.
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index 8662a6b7a30b..25b352b664d9 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -68,7 +68,6 @@ obj-$(CONFIG_WAFER_WDT) += wafer5823wdt.o
68obj-$(CONFIG_I6300ESB_WDT) += i6300esb.o 68obj-$(CONFIG_I6300ESB_WDT) += i6300esb.o
69obj-$(CONFIG_ITCO_WDT) += iTCO_wdt.o iTCO_vendor_support.o 69obj-$(CONFIG_ITCO_WDT) += iTCO_wdt.o iTCO_vendor_support.o
70obj-$(CONFIG_IT8712F_WDT) += it8712f_wdt.o 70obj-$(CONFIG_IT8712F_WDT) += it8712f_wdt.o
71CFLAGS_hpwdt.o += -O
72obj-$(CONFIG_HP_WATCHDOG) += hpwdt.o 71obj-$(CONFIG_HP_WATCHDOG) += hpwdt.o
73obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o 72obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o
74obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o 73obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 4f0f22b020ea..76e5b7386af9 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -529,7 +529,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
529 529
530#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ 530#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
531 /* Clear master flag /before/ clearing selector flag. */ 531 /* Clear master flag /before/ clearing selector flag. */
532 rmb(); 532 wmb();
533#endif 533#endif
534 pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); 534 pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
535 while (pending_words != 0) { 535 while (pending_words != 0) {
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c19184f2e70e..bec76b1c2bb0 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -246,15 +246,11 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
246 246
247} 247}
248 248
249static inline unsigned int zero_metapath_length(const struct metapath *mp, 249static inline unsigned int metapath_branch_start(const struct metapath *mp)
250 unsigned height)
251{ 250{
252 unsigned int i; 251 if (mp->mp_list[0] == 0)
253 for (i = 0; i < height - 1; i++) { 252 return 2;
254 if (mp->mp_list[i] != 0) 253 return 1;
255 return i;
256 }
257 return height;
258} 254}
259 255
260/** 256/**
@@ -436,7 +432,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
436 struct gfs2_sbd *sdp = GFS2_SB(inode); 432 struct gfs2_sbd *sdp = GFS2_SB(inode);
437 struct buffer_head *dibh = mp->mp_bh[0]; 433 struct buffer_head *dibh = mp->mp_bh[0];
438 u64 bn, dblock = 0; 434 u64 bn, dblock = 0;
439 unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0; 435 unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
440 unsigned dblks = 0; 436 unsigned dblks = 0;
441 unsigned ptrs_per_blk; 437 unsigned ptrs_per_blk;
442 const unsigned end_of_metadata = height - 1; 438 const unsigned end_of_metadata = height - 1;
@@ -471,9 +467,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
471 /* Building up tree height */ 467 /* Building up tree height */
472 state = ALLOC_GROW_HEIGHT; 468 state = ALLOC_GROW_HEIGHT;
473 iblks = height - ip->i_height; 469 iblks = height - ip->i_height;
474 zmpl = zero_metapath_length(mp, height); 470 branch_start = metapath_branch_start(mp);
475 iblks -= zmpl; 471 iblks += (height - branch_start);
476 iblks += height;
477 } 472 }
478 } 473 }
479 474
@@ -509,13 +504,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
509 sizeof(struct gfs2_meta_header)); 504 sizeof(struct gfs2_meta_header));
510 *ptr = zero_bn; 505 *ptr = zero_bn;
511 state = ALLOC_GROW_DEPTH; 506 state = ALLOC_GROW_DEPTH;
512 for(i = zmpl; i < height; i++) { 507 for(i = branch_start; i < height; i++) {
513 if (mp->mp_bh[i] == NULL) 508 if (mp->mp_bh[i] == NULL)
514 break; 509 break;
515 brelse(mp->mp_bh[i]); 510 brelse(mp->mp_bh[i]);
516 mp->mp_bh[i] = NULL; 511 mp->mp_bh[i] = NULL;
517 } 512 }
518 i = zmpl; 513 i = branch_start;
519 } 514 }
520 if (n == 0) 515 if (n == 0)
521 break; 516 break;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 6387523a3153..3401628d742b 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -195,7 +195,7 @@ ulong_aligned:
195 depending on architecture. I've experimented with several ways 195 depending on architecture. I've experimented with several ways
196 of writing this section such as using an else before the goto 196 of writing this section such as using an else before the goto
197 but this one seems to be the fastest. */ 197 but this one seems to be the fastest. */
198 while ((unsigned char *)plong < end - 1) { 198 while ((unsigned char *)plong < end - sizeof(unsigned long)) {
199 prefetch(plong + 1); 199 prefetch(plong + 1);
200 if (((*plong) & LBITMASK) != lskipval) 200 if (((*plong) & LBITMASK) != lskipval)
201 break; 201 break;
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 49c7cd0502cc..779d2eb649c5 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p,
130 struct mnt_fhstatus *res) 130 struct mnt_fhstatus *res)
131{ 131{
132 struct nfs_fh *fh = res->fh; 132 struct nfs_fh *fh = res->fh;
133 unsigned size;
133 134
134 if ((res->status = ntohl(*p++)) == 0) { 135 if ((res->status = ntohl(*p++)) == 0) {
135 int size = ntohl(*p++); 136 size = ntohl(*p++);
136 if (size <= NFS3_FHSIZE) { 137 if (size <= NFS3_FHSIZE && size != 0) {
137 fh->size = size; 138 fh->size = size;
138 memcpy(fh->data, p, size); 139 memcpy(fh->data, p, size);
139 } else 140 } else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2a4a024a4e7b..614efeed5437 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1216,8 +1216,6 @@ static int nfs_validate_mount_data(void *options,
1216{ 1216{
1217 struct nfs_mount_data *data = (struct nfs_mount_data *)options; 1217 struct nfs_mount_data *data = (struct nfs_mount_data *)options;
1218 1218
1219 memset(args, 0, sizeof(*args));
1220
1221 if (data == NULL) 1219 if (data == NULL)
1222 goto out_no_data; 1220 goto out_no_data;
1223 1221
@@ -1251,13 +1249,13 @@ static int nfs_validate_mount_data(void *options,
1251 case 5: 1249 case 5:
1252 memset(data->context, 0, sizeof(data->context)); 1250 memset(data->context, 0, sizeof(data->context));
1253 case 6: 1251 case 6:
1254 if (data->flags & NFS_MOUNT_VER3) 1252 if (data->flags & NFS_MOUNT_VER3) {
1253 if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
1254 goto out_invalid_fh;
1255 mntfh->size = data->root.size; 1255 mntfh->size = data->root.size;
1256 else 1256 } else
1257 mntfh->size = NFS2_FHSIZE; 1257 mntfh->size = NFS2_FHSIZE;
1258 1258
1259 if (mntfh->size > sizeof(mntfh->data))
1260 goto out_invalid_fh;
1261 1259
1262 memcpy(mntfh->data, data->root.data, mntfh->size); 1260 memcpy(mntfh->data, data->root.data, mntfh->size);
1263 if (mntfh->size < sizeof(mntfh->data)) 1261 if (mntfh->size < sizeof(mntfh->data))
@@ -1585,24 +1583,29 @@ static int nfs_get_sb(struct file_system_type *fs_type,
1585{ 1583{
1586 struct nfs_server *server = NULL; 1584 struct nfs_server *server = NULL;
1587 struct super_block *s; 1585 struct super_block *s;
1588 struct nfs_fh mntfh; 1586 struct nfs_parsed_mount_data *data;
1589 struct nfs_parsed_mount_data data; 1587 struct nfs_fh *mntfh;
1590 struct dentry *mntroot; 1588 struct dentry *mntroot;
1591 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 1589 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
1592 struct nfs_sb_mountdata sb_mntdata = { 1590 struct nfs_sb_mountdata sb_mntdata = {
1593 .mntflags = flags, 1591 .mntflags = flags,
1594 }; 1592 };
1595 int error; 1593 int error = -ENOMEM;
1594
1595 data = kzalloc(sizeof(*data), GFP_KERNEL);
1596 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
1597 if (data == NULL || mntfh == NULL)
1598 goto out_free_fh;
1596 1599
1597 security_init_mnt_opts(&data.lsm_opts); 1600 security_init_mnt_opts(&data->lsm_opts);
1598 1601
1599 /* Validate the mount data */ 1602 /* Validate the mount data */
1600 error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); 1603 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
1601 if (error < 0) 1604 if (error < 0)
1602 goto out; 1605 goto out;
1603 1606
1604 /* Get a volume representation */ 1607 /* Get a volume representation */
1605 server = nfs_create_server(&data, &mntfh); 1608 server = nfs_create_server(data, mntfh);
1606 if (IS_ERR(server)) { 1609 if (IS_ERR(server)) {
1607 error = PTR_ERR(server); 1610 error = PTR_ERR(server);
1608 goto out; 1611 goto out;
@@ -1630,16 +1633,16 @@ static int nfs_get_sb(struct file_system_type *fs_type,
1630 1633
1631 if (!s->s_root) { 1634 if (!s->s_root) {
1632 /* initial superblock/root creation */ 1635 /* initial superblock/root creation */
1633 nfs_fill_super(s, &data); 1636 nfs_fill_super(s, data);
1634 } 1637 }
1635 1638
1636 mntroot = nfs_get_root(s, &mntfh); 1639 mntroot = nfs_get_root(s, mntfh);
1637 if (IS_ERR(mntroot)) { 1640 if (IS_ERR(mntroot)) {
1638 error = PTR_ERR(mntroot); 1641 error = PTR_ERR(mntroot);
1639 goto error_splat_super; 1642 goto error_splat_super;
1640 } 1643 }
1641 1644
1642 error = security_sb_set_mnt_opts(s, &data.lsm_opts); 1645 error = security_sb_set_mnt_opts(s, &data->lsm_opts);
1643 if (error) 1646 if (error)
1644 goto error_splat_root; 1647 goto error_splat_root;
1645 1648
@@ -1649,9 +1652,12 @@ static int nfs_get_sb(struct file_system_type *fs_type,
1649 error = 0; 1652 error = 0;
1650 1653
1651out: 1654out:
1652 kfree(data.nfs_server.hostname); 1655 kfree(data->nfs_server.hostname);
1653 kfree(data.mount_server.hostname); 1656 kfree(data->mount_server.hostname);
1654 security_free_mnt_opts(&data.lsm_opts); 1657 security_free_mnt_opts(&data->lsm_opts);
1658out_free_fh:
1659 kfree(mntfh);
1660 kfree(data);
1655 return error; 1661 return error;
1656 1662
1657out_err_nosb: 1663out_err_nosb:
@@ -1800,8 +1806,6 @@ static int nfs4_validate_mount_data(void *options,
1800 struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; 1806 struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
1801 char *c; 1807 char *c;
1802 1808
1803 memset(args, 0, sizeof(*args));
1804
1805 if (data == NULL) 1809 if (data == NULL)
1806 goto out_no_data; 1810 goto out_no_data;
1807 1811
@@ -1959,26 +1963,31 @@ out_no_client_address:
1959static int nfs4_get_sb(struct file_system_type *fs_type, 1963static int nfs4_get_sb(struct file_system_type *fs_type,
1960 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 1964 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1961{ 1965{
1962 struct nfs_parsed_mount_data data; 1966 struct nfs_parsed_mount_data *data;
1963 struct super_block *s; 1967 struct super_block *s;
1964 struct nfs_server *server; 1968 struct nfs_server *server;
1965 struct nfs_fh mntfh; 1969 struct nfs_fh *mntfh;
1966 struct dentry *mntroot; 1970 struct dentry *mntroot;
1967 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 1971 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
1968 struct nfs_sb_mountdata sb_mntdata = { 1972 struct nfs_sb_mountdata sb_mntdata = {
1969 .mntflags = flags, 1973 .mntflags = flags,
1970 }; 1974 };
1971 int error; 1975 int error = -ENOMEM;
1972 1976
1973 security_init_mnt_opts(&data.lsm_opts); 1977 data = kzalloc(sizeof(*data), GFP_KERNEL);
1978 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
1979 if (data == NULL || mntfh == NULL)
1980 goto out_free_fh;
1981
1982 security_init_mnt_opts(&data->lsm_opts);
1974 1983
1975 /* Validate the mount data */ 1984 /* Validate the mount data */
1976 error = nfs4_validate_mount_data(raw_data, &data, dev_name); 1985 error = nfs4_validate_mount_data(raw_data, data, dev_name);
1977 if (error < 0) 1986 if (error < 0)
1978 goto out; 1987 goto out;
1979 1988
1980 /* Get a volume representation */ 1989 /* Get a volume representation */
1981 server = nfs4_create_server(&data, &mntfh); 1990 server = nfs4_create_server(data, mntfh);
1982 if (IS_ERR(server)) { 1991 if (IS_ERR(server)) {
1983 error = PTR_ERR(server); 1992 error = PTR_ERR(server);
1984 goto out; 1993 goto out;
@@ -2009,13 +2018,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2009 nfs4_fill_super(s); 2018 nfs4_fill_super(s);
2010 } 2019 }
2011 2020
2012 mntroot = nfs4_get_root(s, &mntfh); 2021 mntroot = nfs4_get_root(s, mntfh);
2013 if (IS_ERR(mntroot)) { 2022 if (IS_ERR(mntroot)) {
2014 error = PTR_ERR(mntroot); 2023 error = PTR_ERR(mntroot);
2015 goto error_splat_super; 2024 goto error_splat_super;
2016 } 2025 }
2017 2026
2018 error = security_sb_set_mnt_opts(s, &data.lsm_opts); 2027 error = security_sb_set_mnt_opts(s, &data->lsm_opts);
2019 if (error) 2028 if (error)
2020 goto error_splat_root; 2029 goto error_splat_root;
2021 2030
@@ -2025,10 +2034,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2025 error = 0; 2034 error = 0;
2026 2035
2027out: 2036out:
2028 kfree(data.client_address); 2037 kfree(data->client_address);
2029 kfree(data.nfs_server.export_path); 2038 kfree(data->nfs_server.export_path);
2030 kfree(data.nfs_server.hostname); 2039 kfree(data->nfs_server.hostname);
2031 security_free_mnt_opts(&data.lsm_opts); 2040 security_free_mnt_opts(&data->lsm_opts);
2041out_free_fh:
2042 kfree(mntfh);
2043 kfree(data);
2032 return error; 2044 return error;
2033 2045
2034out_free: 2046out_free:
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 6d8ace3e3259..f333848fd3be 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -739,12 +739,13 @@ int nfs_updatepage(struct file *file, struct page *page,
739 } 739 }
740 740
741 status = nfs_writepage_setup(ctx, page, offset, count); 741 status = nfs_writepage_setup(ctx, page, offset, count);
742 __set_page_dirty_nobuffers(page); 742 if (status < 0)
743 nfs_set_pageerror(page);
744 else
745 __set_page_dirty_nobuffers(page);
743 746
744 dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", 747 dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n",
745 status, (long long)i_size_read(inode)); 748 status, (long long)i_size_read(inode));
746 if (status < 0)
747 nfs_set_pageerror(page);
748 return status; 749 return status;
749} 750}
750 751
diff --git a/fs/select.c b/fs/select.c
index 8dda969614a9..da0e88201c3a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -249,7 +249,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
249 retval++; 249 retval++;
250 } 250 }
251 } 251 }
252 cond_resched();
253 } 252 }
254 if (res_in) 253 if (res_in)
255 *rinp = res_in; 254 *rinp = res_in;
@@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
257 *routp = res_out; 256 *routp = res_out;
258 if (res_ex) 257 if (res_ex)
259 *rexp = res_ex; 258 *rexp = res_ex;
259 cond_resched();
260 } 260 }
261 wait = NULL; 261 wait = NULL;
262 if (retval || !*timeout || signal_pending(current)) 262 if (retval || !*timeout || signal_pending(current))
diff --git a/include/asm-alpha/percpu.h b/include/asm-alpha/percpu.h
index 82e8a94b4b2f..3495e8e00d70 100644
--- a/include/asm-alpha/percpu.h
+++ b/include/asm-alpha/percpu.h
@@ -69,6 +69,8 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
69#define __get_cpu_var(var) per_cpu_var(var) 69#define __get_cpu_var(var) per_cpu_var(var)
70#define __raw_get_cpu_var(var) per_cpu_var(var) 70#define __raw_get_cpu_var(var) per_cpu_var(var)
71 71
72#define PER_CPU_ATTRIBUTES
73
72#endif /* SMP */ 74#endif /* SMP */
73 75
74#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name) 76#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name)
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 1d8cd01fa514..844f2a89afbc 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -18,6 +18,7 @@
18#include <linux/kvm_para.h> 18#include <linux/kvm_para.h>
19#include <linux/kvm_types.h> 19#include <linux/kvm_types.h>
20 20
21#include <asm/pvclock-abi.h>
21#include <asm/desc.h> 22#include <asm/desc.h>
22 23
23#define KVM_MAX_VCPUS 16 24#define KVM_MAX_VCPUS 16
@@ -282,7 +283,8 @@ struct kvm_vcpu_arch {
282 struct x86_emulate_ctxt emulate_ctxt; 283 struct x86_emulate_ctxt emulate_ctxt;
283 284
284 gpa_t time; 285 gpa_t time;
285 struct kvm_vcpu_time_info hv_clock; 286 struct pvclock_vcpu_time_info hv_clock;
287 unsigned int hv_clock_tsc_khz;
286 unsigned int time_offset; 288 unsigned int time_offset;
287 struct page *time_page; 289 struct page *time_page;
288}; 290};
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index 509845942070..bfd9900742bf 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -48,24 +48,6 @@ struct kvm_mmu_op_release_pt {
48#ifdef __KERNEL__ 48#ifdef __KERNEL__
49#include <asm/processor.h> 49#include <asm/processor.h>
50 50
51/* xen binary-compatible interface. See xen headers for details */
52struct kvm_vcpu_time_info {
53 uint32_t version;
54 uint32_t pad0;
55 uint64_t tsc_timestamp;
56 uint64_t system_time;
57 uint32_t tsc_to_system_mul;
58 int8_t tsc_shift;
59 int8_t pad[3];
60} __attribute__((__packed__)); /* 32 bytes */
61
62struct kvm_wall_clock {
63 uint32_t wc_version;
64 uint32_t wc_sec;
65 uint32_t wc_nsec;
66} __attribute__((__packed__));
67
68
69extern void kvmclock_init(void); 51extern void kvmclock_init(void);
70 52
71 53
diff --git a/include/asm-x86/pvclock-abi.h b/include/asm-x86/pvclock-abi.h
new file mode 100644
index 000000000000..6857f840b243
--- /dev/null
+++ b/include/asm-x86/pvclock-abi.h
@@ -0,0 +1,42 @@
1#ifndef _ASM_X86_PVCLOCK_ABI_H_
2#define _ASM_X86_PVCLOCK_ABI_H_
3#ifndef __ASSEMBLY__
4
5/*
6 * These structs MUST NOT be changed.
7 * They are the ABI between hypervisor and guest OS.
8 * Both Xen and KVM are using this.
9 *
10 * pvclock_vcpu_time_info holds the system time and the tsc timestamp
11 * of the last update. So the guest can use the tsc delta to get a
12 * more precise system time. There is one per virtual cpu.
13 *
14 * pvclock_wall_clock references the point in time when the system
15 * time was zero (usually boot time), thus the guest calculates the
16 * current wall clock by adding the system time.
17 *
18 * Protocol for the "version" fields is: hypervisor raises it (making
19 * it uneven) before it starts updating the fields and raises it again
20 * (making it even) when it is done. Thus the guest can make sure the
21 * time values it got are consistent by checking the version before
22 * and after reading them.
23 */
24
25struct pvclock_vcpu_time_info {
26 u32 version;
27 u32 pad0;
28 u64 tsc_timestamp;
29 u64 system_time;
30 u32 tsc_to_system_mul;
31 s8 tsc_shift;
32 u8 pad[3];
33} __attribute__((__packed__)); /* 32 bytes */
34
35struct pvclock_wall_clock {
36 u32 version;
37 u32 sec;
38 u32 nsec;
39} __attribute__((__packed__));
40
41#endif /* __ASSEMBLY__ */
42#endif /* _ASM_X86_PVCLOCK_ABI_H_ */
diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h
new file mode 100644
index 000000000000..85b1bba8e0a3
--- /dev/null
+++ b/include/asm-x86/pvclock.h
@@ -0,0 +1,13 @@
1#ifndef _ASM_X86_PVCLOCK_H_
2#define _ASM_X86_PVCLOCK_H_
3
4#include <linux/clocksource.h>
5#include <asm/pvclock-abi.h>
6
7/* some helper functions for xen and kvm pv clock sources */
8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
9void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
10 struct pvclock_vcpu_time_info *vcpu,
11 struct timespec *ts);
12
13#endif /* _ASM_X86_PVCLOCK_H_ */
diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h
index baf3a4dce28c..e11f24038b1d 100644
--- a/include/asm-x86/xen/page.h
+++ b/include/asm-x86/xen/page.h
@@ -150,13 +150,9 @@ static inline pte_t __pte_ma(pteval_t x)
150 return (pte_t) { .pte = x }; 150 return (pte_t) { .pte = x };
151} 151}
152 152
153#ifdef CONFIG_X86_PAE
154#define pmd_val_ma(v) ((v).pmd) 153#define pmd_val_ma(v) ((v).pmd)
155#define pud_val_ma(v) ((v).pgd.pgd) 154#define pud_val_ma(v) ((v).pgd.pgd)
156#define __pmd_ma(x) ((pmd_t) { (x) } ) 155#define __pmd_ma(x) ((pmd_t) { (x) } )
157#else /* !X86_PAE */
158#define pmd_val_ma(v) ((v).pud.pgd.pgd)
159#endif /* CONFIG_X86_PAE */
160 156
161#define pgd_val_ma(x) ((x).pgd) 157#define pgd_val_ma(x) ((x).pgd)
162 158
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 092b1b25291d..de9d1df4bba2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -33,6 +33,7 @@
33#define KVM_REQ_REPORT_TPR_ACCESS 2 33#define KVM_REQ_REPORT_TPR_ACCESS 2
34#define KVM_REQ_MMU_RELOAD 3 34#define KVM_REQ_MMU_RELOAD 3
35#define KVM_REQ_TRIPLE_FAULT 4 35#define KVM_REQ_TRIPLE_FAULT 4
36#define KVM_REQ_PENDING_TIMER 5
36 37
37struct kvm_vcpu; 38struct kvm_vcpu;
38extern struct kmem_cache *kvm_vcpu_cache; 39extern struct kmem_cache *kvm_vcpu_cache;
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 59f1c0bd8f9c..d2a003586761 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -27,8 +27,7 @@
27 * This routine is called by the kernel to write a series of 27 * This routine is called by the kernel to write a series of
28 * characters to the tty device. The characters may come from 28 * characters to the tty device. The characters may come from
29 * user space or kernel space. This routine will return the 29 * user space or kernel space. This routine will return the
30 * number of characters actually accepted for writing. This 30 * number of characters actually accepted for writing.
31 * routine is mandatory.
32 * 31 *
33 * Optional: Required for writable devices. 32 * Optional: Required for writable devices.
34 * 33 *
@@ -134,7 +133,7 @@
134 * This routine notifies the tty driver that it should hangup the 133 * This routine notifies the tty driver that it should hangup the
135 * tty device. 134 * tty device.
136 * 135 *
137 * Required: 136 * Optional:
138 * 137 *
139 * void (*break_ctl)(struct tty_stuct *tty, int state); 138 * void (*break_ctl)(struct tty_stuct *tty, int state);
140 * 139 *
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 9b018da48cf3..819a0331cda9 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -10,6 +10,7 @@
10#define __XEN_PUBLIC_XEN_H__ 10#define __XEN_PUBLIC_XEN_H__
11 11
12#include <asm/xen/interface.h> 12#include <asm/xen/interface.h>
13#include <asm/pvclock-abi.h>
13 14
14/* 15/*
15 * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). 16 * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
@@ -336,7 +337,7 @@ struct vcpu_info {
336 uint8_t evtchn_upcall_mask; 337 uint8_t evtchn_upcall_mask;
337 unsigned long evtchn_pending_sel; 338 unsigned long evtchn_pending_sel;
338 struct arch_vcpu_info arch; 339 struct arch_vcpu_info arch;
339 struct vcpu_time_info time; 340 struct pvclock_vcpu_time_info time;
340}; /* 64 bytes (x86) */ 341}; /* 64 bytes (x86) */
341 342
342/* 343/*
@@ -384,9 +385,7 @@ struct shared_info {
384 * Wallclock time: updated only by control software. Guests should base 385 * Wallclock time: updated only by control software. Guests should base
385 * their gettimeofday() syscall on this wallclock-base value. 386 * their gettimeofday() syscall on this wallclock-base value.
386 */ 387 */
387 uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */ 388 struct pvclock_wall_clock wc;
388 uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */
389 uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */
390 389
391 struct arch_shared_info arch; 390 struct arch_shared_info arch;
392 391
diff --git a/kernel/futex.c b/kernel/futex.c
index 449def8074fe..7d1136e97c14 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1096,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q)
1096 * private futexes. 1096 * private futexes.
1097 */ 1097 */
1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1099 struct task_struct *newowner) 1099 struct task_struct *newowner,
1100 struct rw_semaphore *fshared)
1100{ 1101{
1101 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1102 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1102 struct futex_pi_state *pi_state = q->pi_state; 1103 struct futex_pi_state *pi_state = q->pi_state;
1104 struct task_struct *oldowner = pi_state->owner;
1103 u32 uval, curval, newval; 1105 u32 uval, curval, newval;
1104 int ret; 1106 int ret, attempt = 0;
1105 1107
1106 /* Owner died? */ 1108 /* Owner died? */
1109 if (!pi_state->owner)
1110 newtid |= FUTEX_OWNER_DIED;
1111
1112 /*
1113 * We are here either because we stole the rtmutex from the
1114 * pending owner or we are the pending owner which failed to
1115 * get the rtmutex. We have to replace the pending owner TID
1116 * in the user space variable. This must be atomic as we have
1117 * to preserve the owner died bit here.
1118 *
1119 * Note: We write the user space value _before_ changing the
1120 * pi_state because we can fault here. Imagine swapped out
1121 * pages or a fork, which was running right before we acquired
1122 * mmap_sem, that marked all the anonymous memory readonly for
1123 * cow.
1124 *
1125 * Modifying pi_state _before_ the user space value would
1126 * leave the pi_state in an inconsistent state when we fault
1127 * here, because we need to drop the hash bucket lock to
1128 * handle the fault. This might be observed in the PID check
1129 * in lookup_pi_state.
1130 */
1131retry:
1132 if (get_futex_value_locked(&uval, uaddr))
1133 goto handle_fault;
1134
1135 while (1) {
1136 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1137
1138 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1139
1140 if (curval == -EFAULT)
1141 goto handle_fault;
1142 if (curval == uval)
1143 break;
1144 uval = curval;
1145 }
1146
1147 /*
1148 * We fixed up user space. Now we need to fix the pi_state
1149 * itself.
1150 */
1107 if (pi_state->owner != NULL) { 1151 if (pi_state->owner != NULL) {
1108 spin_lock_irq(&pi_state->owner->pi_lock); 1152 spin_lock_irq(&pi_state->owner->pi_lock);
1109 WARN_ON(list_empty(&pi_state->list)); 1153 WARN_ON(list_empty(&pi_state->list));
1110 list_del_init(&pi_state->list); 1154 list_del_init(&pi_state->list);
1111 spin_unlock_irq(&pi_state->owner->pi_lock); 1155 spin_unlock_irq(&pi_state->owner->pi_lock);
1112 } else 1156 }
1113 newtid |= FUTEX_OWNER_DIED;
1114 1157
1115 pi_state->owner = newowner; 1158 pi_state->owner = newowner;
1116 1159
@@ -1118,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1118 WARN_ON(!list_empty(&pi_state->list)); 1161 WARN_ON(!list_empty(&pi_state->list));
1119 list_add(&pi_state->list, &newowner->pi_state_list); 1162 list_add(&pi_state->list, &newowner->pi_state_list);
1120 spin_unlock_irq(&newowner->pi_lock); 1163 spin_unlock_irq(&newowner->pi_lock);
1164 return 0;
1121 1165
1122 /* 1166 /*
1123 * We own it, so we have to replace the pending owner 1167 * To handle the page fault we need to drop the hash bucket
1124 * TID. This must be atomic as we have preserve the 1168 * lock here. That gives the other task (either the pending
1125 * owner died bit here. 1169 * owner itself or the task which stole the rtmutex) the
1170 * chance to try the fixup of the pi_state. So once we are
1171 * back from handling the fault we need to check the pi_state
1172 * after reacquiring the hash bucket lock and before trying to
1173 * do another fixup. When the fixup has been done already we
1174 * simply return.
1126 */ 1175 */
1127 ret = get_futex_value_locked(&uval, uaddr); 1176handle_fault:
1177 spin_unlock(q->lock_ptr);
1128 1178
1129 while (!ret) { 1179 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
1130 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1131 1180
1132 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1181 spin_lock(q->lock_ptr);
1133 1182
1134 if (curval == -EFAULT) 1183 /*
1135 ret = -EFAULT; 1184 * Check if someone else fixed it for us:
1136 if (curval == uval) 1185 */
1137 break; 1186 if (pi_state->owner != oldowner)
1138 uval = curval; 1187 return 0;
1139 } 1188
1140 return ret; 1189 if (ret)
1190 return ret;
1191
1192 goto retry;
1141} 1193}
1142 1194
1143/* 1195/*
@@ -1507,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1507 * that case: 1559 * that case:
1508 */ 1560 */
1509 if (q.pi_state->owner != curr) 1561 if (q.pi_state->owner != curr)
1510 ret = fixup_pi_state_owner(uaddr, &q, curr); 1562 ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
1511 } else { 1563 } else {
1512 /* 1564 /*
1513 * Catch the rare case, where the lock was released 1565 * Catch the rare case, where the lock was released
@@ -1539,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1539 int res; 1591 int res;
1540 1592
1541 owner = rt_mutex_owner(&q.pi_state->pi_mutex); 1593 owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1542 res = fixup_pi_state_owner(uaddr, &q, owner); 1594 res = fixup_pi_state_owner(uaddr, &q, owner,
1595 fshared);
1543 1596
1544 /* propagate -EFAULT, if the fixup failed */ 1597 /* propagate -EFAULT, if the fixup failed */
1545 if (res) 1598 if (res)
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 79e3c90113c2..3ec23c3ec97f 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -1499,7 +1499,8 @@ int kgdb_nmicallback(int cpu, void *regs)
1499 return 1; 1499 return 1;
1500} 1500}
1501 1501
1502void kgdb_console_write(struct console *co, const char *s, unsigned count) 1502static void kgdb_console_write(struct console *co, const char *s,
1503 unsigned count)
1503{ 1504{
1504 unsigned long flags; 1505 unsigned long flags;
1505 1506
diff --git a/kernel/sched.c b/kernel/sched.c
index adb2d01fccc2..c51d9fae8cd8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4403,22 +4403,20 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4403 signal_pending(current)) || 4403 signal_pending(current)) ||
4404 (state == TASK_KILLABLE && 4404 (state == TASK_KILLABLE &&
4405 fatal_signal_pending(current))) { 4405 fatal_signal_pending(current))) {
4406 __remove_wait_queue(&x->wait, &wait); 4406 timeout = -ERESTARTSYS;
4407 return -ERESTARTSYS; 4407 break;
4408 } 4408 }
4409 __set_current_state(state); 4409 __set_current_state(state);
4410 spin_unlock_irq(&x->wait.lock); 4410 spin_unlock_irq(&x->wait.lock);
4411 timeout = schedule_timeout(timeout); 4411 timeout = schedule_timeout(timeout);
4412 spin_lock_irq(&x->wait.lock); 4412 spin_lock_irq(&x->wait.lock);
4413 if (!timeout) { 4413 } while (!x->done && timeout);
4414 __remove_wait_queue(&x->wait, &wait);
4415 return timeout;
4416 }
4417 } while (!x->done);
4418 __remove_wait_queue(&x->wait, &wait); 4414 __remove_wait_queue(&x->wait, &wait);
4415 if (!x->done)
4416 return timeout;
4419 } 4417 }
4420 x->done--; 4418 x->done--;
4421 return timeout; 4419 return timeout ?: 1;
4422} 4420}
4423 4421
4424static long __sched 4422static long __sched
diff --git a/mm/memory.c b/mm/memory.c
index 9aefaae46858..d14b251a25a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1045,6 +1045,26 @@ no_page_table:
1045 return page; 1045 return page;
1046} 1046}
1047 1047
1048/* Can we do the FOLL_ANON optimization? */
1049static inline int use_zero_page(struct vm_area_struct *vma)
1050{
1051 /*
1052 * We don't want to optimize FOLL_ANON for make_pages_present()
1053 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
1054 * we want to get the page from the page tables to make sure
1055 * that we serialize and update with any other user of that
1056 * mapping.
1057 */
1058 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1059 return 0;
1060 /*
1061 * And if we have a fault or a nopfn routine, it's not an
1062 * anonymous region.
1063 */
1064 return !vma->vm_ops ||
1065 (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
1066}
1067
1048int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1068int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1049 unsigned long start, int len, int write, int force, 1069 unsigned long start, int len, int write, int force,
1050 struct page **pages, struct vm_area_struct **vmas) 1070 struct page **pages, struct vm_area_struct **vmas)
@@ -1119,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1119 foll_flags = FOLL_TOUCH; 1139 foll_flags = FOLL_TOUCH;
1120 if (pages) 1140 if (pages)
1121 foll_flags |= FOLL_GET; 1141 foll_flags |= FOLL_GET;
1122 if (!write && !(vma->vm_flags & VM_LOCKED) && 1142 if (!write && use_zero_page(vma))
1123 (!vma->vm_ops || !vma->vm_ops->fault))
1124 foll_flags |= FOLL_ANON; 1143 foll_flags |= FOLL_ANON;
1125 1144
1126 do { 1145 do {
@@ -1766,7 +1785,6 @@ gotten:
1766 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 1785 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1767 if (likely(pte_same(*page_table, orig_pte))) { 1786 if (likely(pte_same(*page_table, orig_pte))) {
1768 if (old_page) { 1787 if (old_page) {
1769 page_remove_rmap(old_page, vma);
1770 if (!PageAnon(old_page)) { 1788 if (!PageAnon(old_page)) {
1771 dec_mm_counter(mm, file_rss); 1789 dec_mm_counter(mm, file_rss);
1772 inc_mm_counter(mm, anon_rss); 1790 inc_mm_counter(mm, anon_rss);
@@ -1788,6 +1806,32 @@ gotten:
1788 lru_cache_add_active(new_page); 1806 lru_cache_add_active(new_page);
1789 page_add_new_anon_rmap(new_page, vma, address); 1807 page_add_new_anon_rmap(new_page, vma, address);
1790 1808
1809 if (old_page) {
1810 /*
1811 * Only after switching the pte to the new page may
1812 * we remove the mapcount here. Otherwise another
1813 * process may come and find the rmap count decremented
1814 * before the pte is switched to the new page, and
1815 * "reuse" the old page writing into it while our pte
1816 * here still points into it and can be read by other
1817 * threads.
1818 *
1819 * The critical issue is to order this
1820 * page_remove_rmap with the ptp_clear_flush above.
1821 * Those stores are ordered by (if nothing else,)
1822 * the barrier present in the atomic_add_negative
1823 * in page_remove_rmap.
1824 *
1825 * Then the TLB flush in ptep_clear_flush ensures that
1826 * no process can access the old page before the
1827 * decremented mapcount is visible. And the old page
1828 * cannot be reused until after the decremented
1829 * mapcount is visible. So transitively, TLBs to
1830 * old page will be flushed before it can be reused.
1831 */
1832 page_remove_rmap(old_page, vma);
1833 }
1834
1791 /* Free the old page.. */ 1835 /* Free the old page.. */
1792 new_page = old_page; 1836 new_page = old_page;
1793 ret |= VM_FAULT_WRITE; 1837 ret |= VM_FAULT_WRITE;
diff --git a/sound/isa/sb/sb_mixer.c b/sound/isa/sb/sb_mixer.c
index 91d14224f6b3..73d4572d136b 100644
--- a/sound/isa/sb/sb_mixer.c
+++ b/sound/isa/sb/sb_mixer.c
@@ -925,7 +925,7 @@ static unsigned char als4000_saved_regs[] = {
925static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) 925static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
926{ 926{
927 unsigned char *val = chip->saved_regs; 927 unsigned char *val = chip->saved_regs;
928 snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return); 928 snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return);
929 for (; num_regs; num_regs--) 929 for (; num_regs; num_regs--)
930 *val++ = snd_sbmixer_read(chip, *regs++); 930 *val++ = snd_sbmixer_read(chip, *regs++);
931} 931}
@@ -933,7 +933,7 @@ static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
933static void restore_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) 933static void restore_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs)
934{ 934{
935 unsigned char *val = chip->saved_regs; 935 unsigned char *val = chip->saved_regs;
936 snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return); 936 snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return);
937 for (; num_regs; num_regs--) 937 for (; num_regs; num_regs--)
938 snd_sbmixer_write(chip, *regs++, *val++); 938 snd_sbmixer_write(chip, *regs++, *val++);
939} 939}
diff --git a/sound/pci/aw2/aw2-alsa.c b/sound/pci/aw2/aw2-alsa.c
index 56f87cd33c19..3f00ddf450f8 100644
--- a/sound/pci/aw2/aw2-alsa.c
+++ b/sound/pci/aw2/aw2-alsa.c
@@ -316,6 +316,8 @@ static int __devinit snd_aw2_create(struct snd_card *card,
316 return -ENOMEM; 316 return -ENOMEM;
317 } 317 }
318 318
319 /* (2) initialization of the chip hardware */
320 snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt);
319 321
320 if (request_irq(pci->irq, snd_aw2_saa7146_interrupt, 322 if (request_irq(pci->irq, snd_aw2_saa7146_interrupt,
321 IRQF_SHARED, "Audiowerk2", chip)) { 323 IRQF_SHARED, "Audiowerk2", chip)) {
@@ -329,8 +331,6 @@ static int __devinit snd_aw2_create(struct snd_card *card,
329 } 331 }
330 chip->irq = pci->irq; 332 chip->irq = pci->irq;
331 333
332 /* (2) initialization of the chip hardware */
333 snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt);
334 err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops); 334 err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
335 if (err < 0) { 335 if (err < 0) {
336 free_irq(chip->irq, (void *)chip); 336 free_irq(chip->irq, (void *)chip);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 98778cb69c6e..1dcf9f3d1107 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -269,28 +269,9 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
269 } 269 }
270} 270}
271 271
272static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) 272static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi)
273{ 273{
274 int i;
275
276 for (i = 0; i < IOAPIC_NUM_PINS; i++)
277 if (ioapic->redirtbl[i].fields.vector == vector)
278 return i;
279 return -1;
280}
281
282void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
283{
284 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
285 union ioapic_redir_entry *ent; 274 union ioapic_redir_entry *ent;
286 int gsi;
287
288 gsi = get_eoi_gsi(ioapic, vector);
289 if (gsi == -1) {
290 printk(KERN_WARNING "Can't find redir item for %d EOI\n",
291 vector);
292 return;
293 }
294 275
295 ent = &ioapic->redirtbl[gsi]; 276 ent = &ioapic->redirtbl[gsi];
296 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); 277 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
@@ -300,6 +281,16 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
300 ioapic_deliver(ioapic, gsi); 281 ioapic_deliver(ioapic, gsi);
301} 282}
302 283
284void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
285{
286 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
287 int i;
288
289 for (i = 0; i < IOAPIC_NUM_PINS; i++)
290 if (ioapic->redirtbl[i].fields.vector == vector)
291 __kvm_ioapic_update_eoi(ioapic, i);
292}
293
303static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr) 294static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
304{ 295{
305 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 296 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;