diff options
49 files changed, 646 insertions, 499 deletions
diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl index 028a8444d95e..e8acd1f03456 100644 --- a/Documentation/DocBook/kgdb.tmpl +++ b/Documentation/DocBook/kgdb.tmpl | |||
@@ -84,10 +84,9 @@ | |||
84 | runs an instance of gdb against the vmlinux file which contains | 84 | runs an instance of gdb against the vmlinux file which contains |
85 | the symbols (not boot image such as bzImage, zImage, uImage...). | 85 | the symbols (not boot image such as bzImage, zImage, uImage...). |
86 | In gdb the developer specifies the connection parameters and | 86 | In gdb the developer specifies the connection parameters and |
87 | connects to kgdb. Depending on which kgdb I/O modules exist in | 87 | connects to kgdb. The type of connection a developer makes with |
88 | the kernel for a given architecture, it may be possible to debug | 88 | gdb depends on the availability of kgdb I/O modules compiled as |
89 | the test machine's kernel with the development machine using a | 89 | builtin's or kernel modules in the test machine's kernel. |
90 | rs232 or ethernet connection. | ||
91 | </para> | 90 | </para> |
92 | </chapter> | 91 | </chapter> |
93 | <chapter id="CompilingAKernel"> | 92 | <chapter id="CompilingAKernel"> |
@@ -223,7 +222,7 @@ | |||
223 | </para> | 222 | </para> |
224 | <para> | 223 | <para> |
225 | IMPORTANT NOTE: Using this option with kgdb over the console | 224 | IMPORTANT NOTE: Using this option with kgdb over the console |
226 | (kgdboc) or kgdb over ethernet (kgdboe) is not supported. | 225 | (kgdboc) is not supported. |
227 | </para> | 226 | </para> |
228 | </sect1> | 227 | </sect1> |
229 | </chapter> | 228 | </chapter> |
@@ -249,18 +248,11 @@ | |||
249 | (gdb) target remote /dev/ttyS0 | 248 | (gdb) target remote /dev/ttyS0 |
250 | </programlisting> | 249 | </programlisting> |
251 | <para> | 250 | <para> |
252 | Example (kgdb to a terminal server): | 251 | Example (kgdb to a terminal server on tcp port 2012): |
253 | </para> | 252 | </para> |
254 | <programlisting> | 253 | <programlisting> |
255 | % gdb ./vmlinux | 254 | % gdb ./vmlinux |
256 | (gdb) target remote udp:192.168.2.2:6443 | 255 | (gdb) target remote 192.168.2.2:2012 |
257 | </programlisting> | ||
258 | <para> | ||
259 | Example (kgdb over ethernet): | ||
260 | </para> | ||
261 | <programlisting> | ||
262 | % gdb ./vmlinux | ||
263 | (gdb) target remote udp:192.168.2.2:6443 | ||
264 | </programlisting> | 256 | </programlisting> |
265 | <para> | 257 | <para> |
266 | Once connected, you can debug a kernel the way you would debug an | 258 | Once connected, you can debug a kernel the way you would debug an |
@@ -1,7 +1,7 @@ | |||
1 | VERSION = 2 | 1 | VERSION = 2 |
2 | PATCHLEVEL = 6 | 2 | PATCHLEVEL = 6 |
3 | SUBLEVEL = 26 | 3 | SUBLEVEL = 26 |
4 | EXTRAVERSION = -rc7 | 4 | EXTRAVERSION = -rc8 |
5 | NAME = Rotary Wombat | 5 | NAME = Rotary Wombat |
6 | 6 | ||
7 | # *DOCUMENTATION* | 7 | # *DOCUMENTATION* |
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 082c31dcfd99..39752cdef6ff 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c | |||
@@ -558,8 +558,6 @@ static struct iosapic_rte_info * __init_refok iosapic_alloc_rte (void) | |||
558 | if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) { | 558 | if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) { |
559 | rte = alloc_bootmem(sizeof(struct iosapic_rte_info) * | 559 | rte = alloc_bootmem(sizeof(struct iosapic_rte_info) * |
560 | NR_PREALLOCATE_RTE_ENTRIES); | 560 | NR_PREALLOCATE_RTE_ENTRIES); |
561 | if (!rte) | ||
562 | return NULL; | ||
563 | for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++) | 561 | for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++) |
564 | list_add(&rte->rte_list, &free_rte_list); | 562 | list_add(&rte->rte_list, &free_rte_list); |
565 | } | 563 | } |
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index f48a809c686d..4ae15c8c2488 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c | |||
@@ -578,8 +578,6 @@ setup_arch (char **cmdline_p) | |||
578 | cpu_init(); /* initialize the bootstrap CPU */ | 578 | cpu_init(); /* initialize the bootstrap CPU */ |
579 | mmu_context_init(); /* initialize context_id bitmap */ | 579 | mmu_context_init(); /* initialize context_id bitmap */ |
580 | 580 | ||
581 | check_sal_cache_flush(); | ||
582 | |||
583 | #ifdef CONFIG_ACPI | 581 | #ifdef CONFIG_ACPI |
584 | acpi_boot_init(); | 582 | acpi_boot_init(); |
585 | #endif | 583 | #endif |
@@ -607,6 +605,7 @@ setup_arch (char **cmdline_p) | |||
607 | ia64_mca_init(); | 605 | ia64_mca_init(); |
608 | 606 | ||
609 | platform_setup(cmdline_p); | 607 | platform_setup(cmdline_p); |
608 | check_sal_cache_flush(); | ||
610 | paging_init(); | 609 | paging_init(); |
611 | } | 610 | } |
612 | 611 | ||
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index 6dd886c5d860..e585f9a2afb9 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c | |||
@@ -512,7 +512,7 @@ static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, si | |||
512 | int cpu; | 512 | int cpu; |
513 | char optstr[64]; | 513 | char optstr[64]; |
514 | 514 | ||
515 | if (count > sizeof(optstr)) | 515 | if (count == 0 || count > sizeof(optstr)) |
516 | return -EINVAL; | 516 | return -EINVAL; |
517 | if (copy_from_user(optstr, user, count)) | 517 | if (copy_from_user(optstr, user, count)) |
518 | return -EFAULT; | 518 | return -EFAULT; |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 52e18e6d2ba0..e0edaaa6920a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -383,6 +383,7 @@ config VMI | |||
383 | config KVM_CLOCK | 383 | config KVM_CLOCK |
384 | bool "KVM paravirtualized clock" | 384 | bool "KVM paravirtualized clock" |
385 | select PARAVIRT | 385 | select PARAVIRT |
386 | select PARAVIRT_CLOCK | ||
386 | depends on !(X86_VISWS || X86_VOYAGER) | 387 | depends on !(X86_VISWS || X86_VOYAGER) |
387 | help | 388 | help |
388 | Turning on this option will allow you to run a paravirtualized clock | 389 | Turning on this option will allow you to run a paravirtualized clock |
@@ -410,6 +411,10 @@ config PARAVIRT | |||
410 | over full virtualization. However, when run without a hypervisor | 411 | over full virtualization. However, when run without a hypervisor |
411 | the kernel is theoretically slower and slightly larger. | 412 | the kernel is theoretically slower and slightly larger. |
412 | 413 | ||
414 | config PARAVIRT_CLOCK | ||
415 | bool | ||
416 | default n | ||
417 | |||
413 | endif | 418 | endif |
414 | 419 | ||
415 | config MEMTEST_BOOTPARAM | 420 | config MEMTEST_BOOTPARAM |
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..77807d4769c9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o | |||
82 | obj-$(CONFIG_KVM_GUEST) += kvm.o | 82 | obj-$(CONFIG_KVM_GUEST) += kvm.o |
83 | obj-$(CONFIG_KVM_CLOCK) += kvmclock.o | 83 | obj-$(CONFIG_KVM_CLOCK) += kvmclock.o |
84 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o | 84 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o |
85 | obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o | ||
85 | 86 | ||
86 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o | 87 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o |
87 | 88 | ||
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 08a30986d472..87edf1ceb1df 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -18,6 +18,7 @@ | |||
18 | 18 | ||
19 | #include <linux/clocksource.h> | 19 | #include <linux/clocksource.h> |
20 | #include <linux/kvm_para.h> | 20 | #include <linux/kvm_para.h> |
21 | #include <asm/pvclock.h> | ||
21 | #include <asm/arch_hooks.h> | 22 | #include <asm/arch_hooks.h> |
22 | #include <asm/msr.h> | 23 | #include <asm/msr.h> |
23 | #include <asm/apic.h> | 24 | #include <asm/apic.h> |
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg) | |||
36 | early_param("no-kvmclock", parse_no_kvmclock); | 37 | early_param("no-kvmclock", parse_no_kvmclock); |
37 | 38 | ||
38 | /* The hypervisor will put information about time periodically here */ | 39 | /* The hypervisor will put information about time periodically here */ |
39 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); | 40 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); |
40 | #define get_clock(cpu, field) per_cpu(hv_clock, cpu).field | 41 | static struct pvclock_wall_clock wall_clock; |
41 | 42 | ||
42 | static inline u64 kvm_get_delta(u64 last_tsc) | ||
43 | { | ||
44 | int cpu = smp_processor_id(); | ||
45 | u64 delta = native_read_tsc() - last_tsc; | ||
46 | return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; | ||
47 | } | ||
48 | |||
49 | static struct kvm_wall_clock wall_clock; | ||
50 | static cycle_t kvm_clock_read(void); | ||
51 | /* | 43 | /* |
52 | * The wallclock is the time of day when we booted. Since then, some time may | 44 | * The wallclock is the time of day when we booted. Since then, some time may |
53 | * have elapsed since the hypervisor wrote the data. So we try to account for | 45 | * have elapsed since the hypervisor wrote the data. So we try to account for |
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void); | |||
55 | */ | 47 | */ |
56 | static unsigned long kvm_get_wallclock(void) | 48 | static unsigned long kvm_get_wallclock(void) |
57 | { | 49 | { |
58 | u32 wc_sec, wc_nsec; | 50 | struct pvclock_vcpu_time_info *vcpu_time; |
59 | u64 delta; | ||
60 | struct timespec ts; | 51 | struct timespec ts; |
61 | int version, nsec; | ||
62 | int low, high; | 52 | int low, high; |
63 | 53 | ||
64 | low = (int)__pa(&wall_clock); | 54 | low = (int)__pa(&wall_clock); |
65 | high = ((u64)__pa(&wall_clock) >> 32); | 55 | high = ((u64)__pa(&wall_clock) >> 32); |
56 | native_write_msr(MSR_KVM_WALL_CLOCK, low, high); | ||
66 | 57 | ||
67 | delta = kvm_clock_read(); | 58 | vcpu_time = &get_cpu_var(hv_clock); |
59 | pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); | ||
60 | put_cpu_var(hv_clock); | ||
68 | 61 | ||
69 | native_write_msr(MSR_KVM_WALL_CLOCK, low, high); | 62 | return ts.tv_sec; |
70 | do { | ||
71 | version = wall_clock.wc_version; | ||
72 | rmb(); | ||
73 | wc_sec = wall_clock.wc_sec; | ||
74 | wc_nsec = wall_clock.wc_nsec; | ||
75 | rmb(); | ||
76 | } while ((wall_clock.wc_version != version) || (version & 1)); | ||
77 | |||
78 | delta = kvm_clock_read() - delta; | ||
79 | delta += wc_nsec; | ||
80 | nsec = do_div(delta, NSEC_PER_SEC); | ||
81 | set_normalized_timespec(&ts, wc_sec + delta, nsec); | ||
82 | /* | ||
83 | * Of all mechanisms of time adjustment I've tested, this one | ||
84 | * was the champion! | ||
85 | */ | ||
86 | return ts.tv_sec + 1; | ||
87 | } | 63 | } |
88 | 64 | ||
89 | static int kvm_set_wallclock(unsigned long now) | 65 | static int kvm_set_wallclock(unsigned long now) |
90 | { | 66 | { |
91 | return 0; | 67 | return -1; |
92 | } | 68 | } |
93 | 69 | ||
94 | /* | ||
95 | * This is our read_clock function. The host puts an tsc timestamp each time | ||
96 | * it updates a new time. Without the tsc adjustment, we can have a situation | ||
97 | * in which a vcpu starts to run earlier (smaller system_time), but probes | ||
98 | * time later (compared to another vcpu), leading to backwards time | ||
99 | */ | ||
100 | static cycle_t kvm_clock_read(void) | 70 | static cycle_t kvm_clock_read(void) |
101 | { | 71 | { |
102 | u64 last_tsc, now; | 72 | struct pvclock_vcpu_time_info *src; |
103 | int cpu; | 73 | cycle_t ret; |
104 | 74 | ||
105 | preempt_disable(); | 75 | src = &get_cpu_var(hv_clock); |
106 | cpu = smp_processor_id(); | 76 | ret = pvclock_clocksource_read(src); |
107 | 77 | put_cpu_var(hv_clock); | |
108 | last_tsc = get_clock(cpu, tsc_timestamp); | 78 | return ret; |
109 | now = get_clock(cpu, system_time); | ||
110 | |||
111 | now += kvm_get_delta(last_tsc); | ||
112 | preempt_enable(); | ||
113 | |||
114 | return now; | ||
115 | } | 79 | } |
80 | |||
116 | static struct clocksource kvm_clock = { | 81 | static struct clocksource kvm_clock = { |
117 | .name = "kvm-clock", | 82 | .name = "kvm-clock", |
118 | .read = kvm_clock_read, | 83 | .read = kvm_clock_read, |
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = { | |||
123 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 88 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
124 | }; | 89 | }; |
125 | 90 | ||
126 | static int kvm_register_clock(void) | 91 | static int kvm_register_clock(char *txt) |
127 | { | 92 | { |
128 | int cpu = smp_processor_id(); | 93 | int cpu = smp_processor_id(); |
129 | int low, high; | 94 | int low, high; |
130 | low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; | 95 | low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; |
131 | high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); | 96 | high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); |
132 | 97 | printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", | |
98 | cpu, high, low, txt); | ||
133 | return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); | 99 | return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); |
134 | } | 100 | } |
135 | 101 | ||
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void) | |||
140 | * Now that the first cpu already had this clocksource initialized, | 106 | * Now that the first cpu already had this clocksource initialized, |
141 | * we shouldn't fail. | 107 | * we shouldn't fail. |
142 | */ | 108 | */ |
143 | WARN_ON(kvm_register_clock()); | 109 | WARN_ON(kvm_register_clock("secondary cpu clock")); |
144 | /* ok, done with our trickery, call native */ | 110 | /* ok, done with our trickery, call native */ |
145 | setup_secondary_APIC_clock(); | 111 | setup_secondary_APIC_clock(); |
146 | } | 112 | } |
147 | #endif | 113 | #endif |
148 | 114 | ||
115 | #ifdef CONFIG_SMP | ||
116 | void __init kvm_smp_prepare_boot_cpu(void) | ||
117 | { | ||
118 | WARN_ON(kvm_register_clock("primary cpu clock")); | ||
119 | native_smp_prepare_boot_cpu(); | ||
120 | } | ||
121 | #endif | ||
122 | |||
149 | /* | 123 | /* |
150 | * After the clock is registered, the host will keep writing to the | 124 | * After the clock is registered, the host will keep writing to the |
151 | * registered memory location. If the guest happens to shutdown, this memory | 125 | * registered memory location. If the guest happens to shutdown, this memory |
@@ -174,7 +148,7 @@ void __init kvmclock_init(void) | |||
174 | return; | 148 | return; |
175 | 149 | ||
176 | if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { | 150 | if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { |
177 | if (kvm_register_clock()) | 151 | if (kvm_register_clock("boot clock")) |
178 | return; | 152 | return; |
179 | pv_time_ops.get_wallclock = kvm_get_wallclock; | 153 | pv_time_ops.get_wallclock = kvm_get_wallclock; |
180 | pv_time_ops.set_wallclock = kvm_set_wallclock; | 154 | pv_time_ops.set_wallclock = kvm_set_wallclock; |
@@ -182,6 +156,9 @@ void __init kvmclock_init(void) | |||
182 | #ifdef CONFIG_X86_LOCAL_APIC | 156 | #ifdef CONFIG_X86_LOCAL_APIC |
183 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; | 157 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; |
184 | #endif | 158 | #endif |
159 | #ifdef CONFIG_SMP | ||
160 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | ||
161 | #endif | ||
185 | machine_ops.shutdown = kvm_shutdown; | 162 | machine_ops.shutdown = kvm_shutdown; |
186 | #ifdef CONFIG_KEXEC | 163 | #ifdef CONFIG_KEXEC |
187 | machine_ops.crash_shutdown = kvm_crash_shutdown; | 164 | machine_ops.crash_shutdown = kvm_crash_shutdown; |
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 000000000000..05fbe9a0325a --- /dev/null +++ b/arch/x86/kernel/pvclock.c | |||
@@ -0,0 +1,141 @@ | |||
1 | /* paravirtual clock -- common code used by kvm/xen | ||
2 | |||
3 | This program is free software; you can redistribute it and/or modify | ||
4 | it under the terms of the GNU General Public License as published by | ||
5 | the Free Software Foundation; either version 2 of the License, or | ||
6 | (at your option) any later version. | ||
7 | |||
8 | This program is distributed in the hope that it will be useful, | ||
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | GNU General Public License for more details. | ||
12 | |||
13 | You should have received a copy of the GNU General Public License | ||
14 | along with this program; if not, write to the Free Software | ||
15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
16 | */ | ||
17 | |||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/percpu.h> | ||
20 | #include <asm/pvclock.h> | ||
21 | |||
22 | /* | ||
23 | * These are perodically updated | ||
24 | * xen: magic shared_info page | ||
25 | * kvm: gpa registered via msr | ||
26 | * and then copied here. | ||
27 | */ | ||
28 | struct pvclock_shadow_time { | ||
29 | u64 tsc_timestamp; /* TSC at last update of time vals. */ | ||
30 | u64 system_timestamp; /* Time, in nanosecs, since boot. */ | ||
31 | u32 tsc_to_nsec_mul; | ||
32 | int tsc_shift; | ||
33 | u32 version; | ||
34 | }; | ||
35 | |||
36 | /* | ||
37 | * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, | ||
38 | * yielding a 64-bit result. | ||
39 | */ | ||
40 | static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) | ||
41 | { | ||
42 | u64 product; | ||
43 | #ifdef __i386__ | ||
44 | u32 tmp1, tmp2; | ||
45 | #endif | ||
46 | |||
47 | if (shift < 0) | ||
48 | delta >>= -shift; | ||
49 | else | ||
50 | delta <<= shift; | ||
51 | |||
52 | #ifdef __i386__ | ||
53 | __asm__ ( | ||
54 | "mul %5 ; " | ||
55 | "mov %4,%%eax ; " | ||
56 | "mov %%edx,%4 ; " | ||
57 | "mul %5 ; " | ||
58 | "xor %5,%5 ; " | ||
59 | "add %4,%%eax ; " | ||
60 | "adc %5,%%edx ; " | ||
61 | : "=A" (product), "=r" (tmp1), "=r" (tmp2) | ||
62 | : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); | ||
63 | #elif __x86_64__ | ||
64 | __asm__ ( | ||
65 | "mul %%rdx ; shrd $32,%%rdx,%%rax" | ||
66 | : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); | ||
67 | #else | ||
68 | #error implement me! | ||
69 | #endif | ||
70 | |||
71 | return product; | ||
72 | } | ||
73 | |||
74 | static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) | ||
75 | { | ||
76 | u64 delta = native_read_tsc() - shadow->tsc_timestamp; | ||
77 | return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Reads a consistent set of time-base values from hypervisor, | ||
82 | * into a shadow data area. | ||
83 | */ | ||
84 | static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, | ||
85 | struct pvclock_vcpu_time_info *src) | ||
86 | { | ||
87 | do { | ||
88 | dst->version = src->version; | ||
89 | rmb(); /* fetch version before data */ | ||
90 | dst->tsc_timestamp = src->tsc_timestamp; | ||
91 | dst->system_timestamp = src->system_time; | ||
92 | dst->tsc_to_nsec_mul = src->tsc_to_system_mul; | ||
93 | dst->tsc_shift = src->tsc_shift; | ||
94 | rmb(); /* test version after fetching data */ | ||
95 | } while ((src->version & 1) || (dst->version != src->version)); | ||
96 | |||
97 | return dst->version; | ||
98 | } | ||
99 | |||
100 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) | ||
101 | { | ||
102 | struct pvclock_shadow_time shadow; | ||
103 | unsigned version; | ||
104 | cycle_t ret, offset; | ||
105 | |||
106 | do { | ||
107 | version = pvclock_get_time_values(&shadow, src); | ||
108 | barrier(); | ||
109 | offset = pvclock_get_nsec_offset(&shadow); | ||
110 | ret = shadow.system_timestamp + offset; | ||
111 | barrier(); | ||
112 | } while (version != src->version); | ||
113 | |||
114 | return ret; | ||
115 | } | ||
116 | |||
117 | void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, | ||
118 | struct pvclock_vcpu_time_info *vcpu_time, | ||
119 | struct timespec *ts) | ||
120 | { | ||
121 | u32 version; | ||
122 | u64 delta; | ||
123 | struct timespec now; | ||
124 | |||
125 | /* get wallclock at system boot */ | ||
126 | do { | ||
127 | version = wall_clock->version; | ||
128 | rmb(); /* fetch version before time */ | ||
129 | now.tv_sec = wall_clock->sec; | ||
130 | now.tv_nsec = wall_clock->nsec; | ||
131 | rmb(); /* fetch time before checking version */ | ||
132 | } while ((wall_clock->version & 1) || (version != wall_clock->version)); | ||
133 | |||
134 | delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ | ||
135 | delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; | ||
136 | |||
137 | now.tv_nsec = do_div(delta, NSEC_PER_SEC); | ||
138 | now.tv_sec = delta; | ||
139 | |||
140 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); | ||
141 | } | ||
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f2f5d260874e..3829aa7b663f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps) | |||
200 | 200 | ||
201 | atomic_inc(&pt->pending); | 201 | atomic_inc(&pt->pending); |
202 | smp_mb__after_atomic_inc(); | 202 | smp_mb__after_atomic_inc(); |
203 | if (vcpu0 && waitqueue_active(&vcpu0->wq)) { | 203 | if (vcpu0) { |
204 | vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 204 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); |
205 | wake_up_interruptible(&vcpu0->wq); | 205 | if (waitqueue_active(&vcpu0->wq)) { |
206 | vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
207 | wake_up_interruptible(&vcpu0->wq); | ||
208 | } | ||
206 | } | 209 | } |
207 | 210 | ||
208 | pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); | 211 | pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c297c50eba63..ebc03f5ae162 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic) | |||
940 | wait_queue_head_t *q = &apic->vcpu->wq; | 940 | wait_queue_head_t *q = &apic->vcpu->wq; |
941 | 941 | ||
942 | atomic_inc(&apic->timer.pending); | 942 | atomic_inc(&apic->timer.pending); |
943 | set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); | ||
943 | if (waitqueue_active(q)) { | 944 | if (waitqueue_active(q)) { |
944 | apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 945 | apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
945 | wake_up_interruptible(q); | 946 | wake_up_interruptible(q); |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ee3f53098f0c..7e7c3969f7a2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
640 | rmap_remove(kvm, spte); | 640 | rmap_remove(kvm, spte); |
641 | --kvm->stat.lpages; | 641 | --kvm->stat.lpages; |
642 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | 642 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); |
643 | spte = NULL; | ||
643 | write_protected = 1; | 644 | write_protected = 1; |
644 | } | 645 | } |
645 | spte = rmap_next(kvm, rmapp, spte); | 646 | spte = rmap_next(kvm, rmapp, spte); |
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1082 | struct kvm_mmu_page *shadow; | 1083 | struct kvm_mmu_page *shadow; |
1083 | 1084 | ||
1084 | spte |= PT_WRITABLE_MASK; | 1085 | spte |= PT_WRITABLE_MASK; |
1085 | if (user_fault) { | ||
1086 | mmu_unshadow(vcpu->kvm, gfn); | ||
1087 | goto unshadowed; | ||
1088 | } | ||
1089 | 1086 | ||
1090 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); | 1087 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); |
1091 | if (shadow || | 1088 | if (shadow || |
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1102 | } | 1099 | } |
1103 | } | 1100 | } |
1104 | 1101 | ||
1105 | unshadowed: | ||
1106 | |||
1107 | if (pte_access & ACC_WRITE_MASK) | 1102 | if (pte_access & ACC_WRITE_MASK) |
1108 | mark_page_dirty(vcpu->kvm, gfn); | 1103 | mark_page_dirty(vcpu->kvm, gfn); |
1109 | 1104 | ||
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | |||
1580 | u64 *spte, | 1575 | u64 *spte, |
1581 | const void *new) | 1576 | const void *new) |
1582 | { | 1577 | { |
1583 | if ((sp->role.level != PT_PAGE_TABLE_LEVEL) | 1578 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { |
1584 | && !vcpu->arch.update_pte.largepage) { | 1579 | if (!vcpu->arch.update_pte.largepage || |
1585 | ++vcpu->kvm->stat.mmu_pde_zapped; | 1580 | sp->role.glevels == PT32_ROOT_LEVEL) { |
1586 | return; | 1581 | ++vcpu->kvm->stat.mmu_pde_zapped; |
1587 | } | 1582 | return; |
1583 | } | ||
1584 | } | ||
1588 | 1585 | ||
1589 | ++vcpu->kvm->stat.mmu_pte_updated; | 1586 | ++vcpu->kvm->stat.mmu_pte_updated; |
1590 | if (sp->role.glevels == PT32_ROOT_LEVEL) | 1587 | if (sp->role.glevels == PT32_ROOT_LEVEL) |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 02efbe75f317..540e95179074 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) | |||
566 | load_transition_efer(vmx); | 566 | load_transition_efer(vmx); |
567 | } | 567 | } |
568 | 568 | ||
569 | static void vmx_load_host_state(struct vcpu_vmx *vmx) | 569 | static void __vmx_load_host_state(struct vcpu_vmx *vmx) |
570 | { | 570 | { |
571 | unsigned long flags; | 571 | unsigned long flags; |
572 | 572 | ||
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) | |||
596 | reload_host_efer(vmx); | 596 | reload_host_efer(vmx); |
597 | } | 597 | } |
598 | 598 | ||
599 | static void vmx_load_host_state(struct vcpu_vmx *vmx) | ||
600 | { | ||
601 | preempt_disable(); | ||
602 | __vmx_load_host_state(vmx); | ||
603 | preempt_enable(); | ||
604 | } | ||
605 | |||
599 | /* | 606 | /* |
600 | * Switches to specified vcpu, until a matching vcpu_put(), but assumes | 607 | * Switches to specified vcpu, until a matching vcpu_put(), but assumes |
601 | * vcpu mutex is already taken. | 608 | * vcpu mutex is already taken. |
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
654 | 661 | ||
655 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) | 662 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) |
656 | { | 663 | { |
657 | vmx_load_host_state(to_vmx(vcpu)); | 664 | __vmx_load_host_state(to_vmx(vcpu)); |
658 | } | 665 | } |
659 | 666 | ||
660 | static void vmx_fpu_activate(struct kvm_vcpu *vcpu) | 667 | static void vmx_fpu_activate(struct kvm_vcpu *vcpu) |
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
884 | switch (msr_index) { | 891 | switch (msr_index) { |
885 | #ifdef CONFIG_X86_64 | 892 | #ifdef CONFIG_X86_64 |
886 | case MSR_EFER: | 893 | case MSR_EFER: |
894 | vmx_load_host_state(vmx); | ||
887 | ret = kvm_set_msr_common(vcpu, msr_index, data); | 895 | ret = kvm_set_msr_common(vcpu, msr_index, data); |
888 | if (vmx->host_state.loaded) { | ||
889 | reload_host_efer(vmx); | ||
890 | load_transition_efer(vmx); | ||
891 | } | ||
892 | break; | 896 | break; |
893 | case MSR_FS_BASE: | 897 | case MSR_FS_BASE: |
894 | vmcs_writel(GUEST_FS_BASE, data); | 898 | vmcs_writel(GUEST_FS_BASE, data); |
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
910 | guest_write_tsc(data); | 914 | guest_write_tsc(data); |
911 | break; | 915 | break; |
912 | default: | 916 | default: |
917 | vmx_load_host_state(vmx); | ||
913 | msr = find_msr_entry(vmx, msr_index); | 918 | msr = find_msr_entry(vmx, msr_index); |
914 | if (msr) { | 919 | if (msr) { |
915 | msr->data = data; | 920 | msr->data = data; |
916 | if (vmx->host_state.loaded) | ||
917 | load_msrs(vmx->guest_msrs, vmx->save_nmsrs); | ||
918 | break; | 921 | break; |
919 | } | 922 | } |
920 | ret = kvm_set_msr_common(vcpu, msr_index, data); | 923 | ret = kvm_set_msr_common(vcpu, msr_index, data); |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00acf1301a15..63a77caa59f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) | |||
492 | static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) | 492 | static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) |
493 | { | 493 | { |
494 | static int version; | 494 | static int version; |
495 | struct kvm_wall_clock wc; | 495 | struct pvclock_wall_clock wc; |
496 | struct timespec wc_ts; | 496 | struct timespec now, sys, boot; |
497 | 497 | ||
498 | if (!wall_clock) | 498 | if (!wall_clock) |
499 | return; | 499 | return; |
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) | |||
502 | 502 | ||
503 | kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); | 503 | kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); |
504 | 504 | ||
505 | wc_ts = current_kernel_time(); | 505 | /* |
506 | wc.wc_sec = wc_ts.tv_sec; | 506 | * The guest calculates current wall clock time by adding |
507 | wc.wc_nsec = wc_ts.tv_nsec; | 507 | * system time (updated by kvm_write_guest_time below) to the |
508 | wc.wc_version = version; | 508 | * wall clock specified here. guest system time equals host |
509 | * system time for us, thus we must fill in host boot time here. | ||
510 | */ | ||
511 | now = current_kernel_time(); | ||
512 | ktime_get_ts(&sys); | ||
513 | boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); | ||
514 | |||
515 | wc.sec = boot.tv_sec; | ||
516 | wc.nsec = boot.tv_nsec; | ||
517 | wc.version = version; | ||
509 | 518 | ||
510 | kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); | 519 | kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); |
511 | 520 | ||
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) | |||
513 | kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); | 522 | kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); |
514 | } | 523 | } |
515 | 524 | ||
525 | static uint32_t div_frac(uint32_t dividend, uint32_t divisor) | ||
526 | { | ||
527 | uint32_t quotient, remainder; | ||
528 | |||
529 | /* Don't try to replace with do_div(), this one calculates | ||
530 | * "(dividend << 32) / divisor" */ | ||
531 | __asm__ ( "divl %4" | ||
532 | : "=a" (quotient), "=d" (remainder) | ||
533 | : "0" (0), "1" (dividend), "r" (divisor) ); | ||
534 | return quotient; | ||
535 | } | ||
536 | |||
537 | static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) | ||
538 | { | ||
539 | uint64_t nsecs = 1000000000LL; | ||
540 | int32_t shift = 0; | ||
541 | uint64_t tps64; | ||
542 | uint32_t tps32; | ||
543 | |||
544 | tps64 = tsc_khz * 1000LL; | ||
545 | while (tps64 > nsecs*2) { | ||
546 | tps64 >>= 1; | ||
547 | shift--; | ||
548 | } | ||
549 | |||
550 | tps32 = (uint32_t)tps64; | ||
551 | while (tps32 <= (uint32_t)nsecs) { | ||
552 | tps32 <<= 1; | ||
553 | shift++; | ||
554 | } | ||
555 | |||
556 | hv_clock->tsc_shift = shift; | ||
557 | hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); | ||
558 | |||
559 | pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", | ||
560 | __FUNCTION__, tsc_khz, hv_clock->tsc_shift, | ||
561 | hv_clock->tsc_to_system_mul); | ||
562 | } | ||
563 | |||
516 | static void kvm_write_guest_time(struct kvm_vcpu *v) | 564 | static void kvm_write_guest_time(struct kvm_vcpu *v) |
517 | { | 565 | { |
518 | struct timespec ts; | 566 | struct timespec ts; |
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) | |||
523 | if ((!vcpu->time_page)) | 571 | if ((!vcpu->time_page)) |
524 | return; | 572 | return; |
525 | 573 | ||
574 | if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { | ||
575 | kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); | ||
576 | vcpu->hv_clock_tsc_khz = tsc_khz; | ||
577 | } | ||
578 | |||
526 | /* Keep irq disabled to prevent changes to the clock */ | 579 | /* Keep irq disabled to prevent changes to the clock */ |
527 | local_irq_save(flags); | 580 | local_irq_save(flags); |
528 | kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, | 581 | kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, |
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) | |||
537 | /* | 590 | /* |
538 | * The interface expects us to write an even number signaling that the | 591 | * The interface expects us to write an even number signaling that the |
539 | * update is finished. Since the guest won't see the intermediate | 592 | * update is finished. Since the guest won't see the intermediate |
540 | * state, we just write "2" at the end | 593 | * state, we just increase by 2 at the end. |
541 | */ | 594 | */ |
542 | vcpu->hv_clock.version = 2; | 595 | vcpu->hv_clock.version += 2; |
543 | 596 | ||
544 | shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); | 597 | shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); |
545 | 598 | ||
546 | memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, | 599 | memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, |
547 | sizeof(vcpu->hv_clock)); | 600 | sizeof(vcpu->hv_clock)); |
548 | 601 | ||
549 | kunmap_atomic(shared_kaddr, KM_USER0); | 602 | kunmap_atomic(shared_kaddr, KM_USER0); |
550 | 603 | ||
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
599 | /* ...but clean it before doing the actual write */ | 652 | /* ...but clean it before doing the actual write */ |
600 | vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); | 653 | vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); |
601 | 654 | ||
602 | vcpu->arch.hv_clock.tsc_to_system_mul = | ||
603 | clocksource_khz2mult(tsc_khz, 22); | ||
604 | vcpu->arch.hv_clock.tsc_shift = 22; | ||
605 | |||
606 | down_read(¤t->mm->mmap_sem); | 655 | down_read(¤t->mm->mmap_sem); |
607 | vcpu->arch.time_page = | 656 | vcpu->arch.time_page = |
608 | gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); | 657 | gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); |
@@ -2759,6 +2808,8 @@ again: | |||
2759 | if (vcpu->requests) { | 2808 | if (vcpu->requests) { |
2760 | if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) | 2809 | if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) |
2761 | __kvm_migrate_timers(vcpu); | 2810 | __kvm_migrate_timers(vcpu); |
2811 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) | ||
2812 | kvm_x86_ops->tlb_flush(vcpu); | ||
2762 | if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, | 2813 | if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, |
2763 | &vcpu->requests)) { | 2814 | &vcpu->requests)) { |
2764 | kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; | 2815 | kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; |
@@ -2772,6 +2823,7 @@ again: | |||
2772 | } | 2823 | } |
2773 | } | 2824 | } |
2774 | 2825 | ||
2826 | clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); | ||
2775 | kvm_inject_pending_timer_irqs(vcpu); | 2827 | kvm_inject_pending_timer_irqs(vcpu); |
2776 | 2828 | ||
2777 | preempt_disable(); | 2829 | preempt_disable(); |
@@ -2781,21 +2833,13 @@ again: | |||
2781 | 2833 | ||
2782 | local_irq_disable(); | 2834 | local_irq_disable(); |
2783 | 2835 | ||
2784 | if (need_resched()) { | 2836 | if (vcpu->requests || need_resched()) { |
2785 | local_irq_enable(); | 2837 | local_irq_enable(); |
2786 | preempt_enable(); | 2838 | preempt_enable(); |
2787 | r = 1; | 2839 | r = 1; |
2788 | goto out; | 2840 | goto out; |
2789 | } | 2841 | } |
2790 | 2842 | ||
2791 | if (vcpu->requests) | ||
2792 | if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) { | ||
2793 | local_irq_enable(); | ||
2794 | preempt_enable(); | ||
2795 | r = 1; | ||
2796 | goto out; | ||
2797 | } | ||
2798 | |||
2799 | if (signal_pending(current)) { | 2843 | if (signal_pending(current)) { |
2800 | local_irq_enable(); | 2844 | local_irq_enable(); |
2801 | preempt_enable(); | 2845 | preempt_enable(); |
@@ -2825,9 +2869,6 @@ again: | |||
2825 | 2869 | ||
2826 | kvm_guest_enter(); | 2870 | kvm_guest_enter(); |
2827 | 2871 | ||
2828 | if (vcpu->requests) | ||
2829 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) | ||
2830 | kvm_x86_ops->tlb_flush(vcpu); | ||
2831 | 2872 | ||
2832 | KVMTRACE_0D(VMENTRY, vcpu, entryexit); | 2873 | KVMTRACE_0D(VMENTRY, vcpu, entryexit); |
2833 | kvm_x86_ops->run(vcpu, kvm_run); | 2874 | kvm_x86_ops->run(vcpu, kvm_run); |
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be2737e..6c388e593bc8 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig | |||
@@ -5,8 +5,9 @@ | |||
5 | config XEN | 5 | config XEN |
6 | bool "Xen guest support" | 6 | bool "Xen guest support" |
7 | select PARAVIRT | 7 | select PARAVIRT |
8 | select PARAVIRT_CLOCK | ||
8 | depends on X86_32 | 9 | depends on X86_32 |
9 | depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) | 10 | depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) |
10 | help | 11 | help |
11 | This is the Linux Xen port. Enabling this will allow the | 12 | This is the Linux Xen port. Enabling this will allow the |
12 | kernel to boot in a paravirtualized environment under the | 13 | kernel to boot in a paravirtualized environment under the |
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e457d61..f09c1c69c37a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) | |||
785 | static __init void xen_pagetable_setup_start(pgd_t *base) | 785 | static __init void xen_pagetable_setup_start(pgd_t *base) |
786 | { | 786 | { |
787 | pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; | 787 | pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; |
788 | int i; | ||
788 | 789 | ||
789 | /* special set_pte for pagetable initialization */ | 790 | /* special set_pte for pagetable initialization */ |
790 | pv_mmu_ops.set_pte = xen_set_pte_init; | 791 | pv_mmu_ops.set_pte = xen_set_pte_init; |
791 | 792 | ||
792 | init_mm.pgd = base; | 793 | init_mm.pgd = base; |
793 | /* | 794 | /* |
794 | * copy top-level of Xen-supplied pagetable into place. For | 795 | * copy top-level of Xen-supplied pagetable into place. This |
795 | * !PAE we can use this as-is, but for PAE it is a stand-in | 796 | * is a stand-in while we copy the pmd pages. |
796 | * while we copy the pmd pages. | ||
797 | */ | 797 | */ |
798 | memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); | 798 | memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); |
799 | 799 | ||
800 | if (PTRS_PER_PMD > 1) { | 800 | /* |
801 | int i; | 801 | * For PAE, need to allocate new pmds, rather than |
802 | /* | 802 | * share Xen's, since Xen doesn't like pmd's being |
803 | * For PAE, need to allocate new pmds, rather than | 803 | * shared between address spaces. |
804 | * share Xen's, since Xen doesn't like pmd's being | 804 | */ |
805 | * shared between address spaces. | 805 | for (i = 0; i < PTRS_PER_PGD; i++) { |
806 | */ | 806 | if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { |
807 | for (i = 0; i < PTRS_PER_PGD; i++) { | 807 | pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); |
808 | if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { | ||
809 | pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); | ||
810 | 808 | ||
811 | memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), | 809 | memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), |
812 | PAGE_SIZE); | 810 | PAGE_SIZE); |
813 | 811 | ||
814 | make_lowmem_page_readonly(pmd); | 812 | make_lowmem_page_readonly(pmd); |
815 | 813 | ||
816 | set_pgd(&base[i], __pgd(1 + __pa(pmd))); | 814 | set_pgd(&base[i], __pgd(1 + __pa(pmd))); |
817 | } else | 815 | } else |
818 | pgd_clear(&base[i]); | 816 | pgd_clear(&base[i]); |
819 | } | ||
820 | } | 817 | } |
821 | 818 | ||
822 | /* make sure zero_page is mapped RO so we can use it in pagetables */ | 819 | /* make sure zero_page is mapped RO so we can use it in pagetables */ |
@@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) | |||
873 | 870 | ||
874 | /* Actually pin the pagetable down, but we can't set PG_pinned | 871 | /* Actually pin the pagetable down, but we can't set PG_pinned |
875 | yet because the page structures don't exist yet. */ | 872 | yet because the page structures don't exist yet. */ |
876 | { | 873 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); |
877 | unsigned level; | ||
878 | |||
879 | #ifdef CONFIG_X86_PAE | ||
880 | level = MMUEXT_PIN_L3_TABLE; | ||
881 | #else | ||
882 | level = MMUEXT_PIN_L2_TABLE; | ||
883 | #endif | ||
884 | |||
885 | pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); | ||
886 | } | ||
887 | } | 874 | } |
888 | 875 | ||
889 | /* This is called once we have the cpu_possible_map */ | 876 | /* This is called once we have the cpu_possible_map */ |
@@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { | |||
1093 | .make_pte = xen_make_pte, | 1080 | .make_pte = xen_make_pte, |
1094 | .make_pgd = xen_make_pgd, | 1081 | .make_pgd = xen_make_pgd, |
1095 | 1082 | ||
1096 | #ifdef CONFIG_X86_PAE | ||
1097 | .set_pte_atomic = xen_set_pte_atomic, | 1083 | .set_pte_atomic = xen_set_pte_atomic, |
1098 | .set_pte_present = xen_set_pte_at, | 1084 | .set_pte_present = xen_set_pte_at, |
1099 | .set_pud = xen_set_pud, | 1085 | .set_pud = xen_set_pud, |
@@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { | |||
1102 | 1088 | ||
1103 | .make_pmd = xen_make_pmd, | 1089 | .make_pmd = xen_make_pmd, |
1104 | .pmd_val = xen_pmd_val, | 1090 | .pmd_val = xen_pmd_val, |
1105 | #endif /* PAE */ | ||
1106 | 1091 | ||
1107 | .activate_mm = xen_activate_mm, | 1092 | .activate_mm = xen_activate_mm, |
1108 | .dup_mmap = xen_dup_mmap, | 1093 | .dup_mmap = xen_dup_mmap, |
@@ -1228,6 +1213,11 @@ asmlinkage void __init xen_start_kernel(void) | |||
1228 | if (xen_feature(XENFEAT_supervisor_mode_kernel)) | 1213 | if (xen_feature(XENFEAT_supervisor_mode_kernel)) |
1229 | pv_info.kernel_rpl = 0; | 1214 | pv_info.kernel_rpl = 0; |
1230 | 1215 | ||
1216 | /* Prevent unwanted bits from being set in PTEs. */ | ||
1217 | __supported_pte_mask &= ~_PAGE_GLOBAL; | ||
1218 | if (!is_initial_xendomain()) | ||
1219 | __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); | ||
1220 | |||
1231 | /* set the limit of our address space */ | 1221 | /* set the limit of our address space */ |
1232 | xen_reserve_top(); | 1222 | xen_reserve_top(); |
1233 | 1223 | ||
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3525ef523a74..df40bf74ea75 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -179,50 +179,56 @@ out: | |||
179 | preempt_enable(); | 179 | preempt_enable(); |
180 | } | 180 | } |
181 | 181 | ||
182 | pteval_t xen_pte_val(pte_t pte) | 182 | /* Assume pteval_t is equivalent to all the other *val_t types. */ |
183 | static pteval_t pte_mfn_to_pfn(pteval_t val) | ||
184 | { | ||
185 | if (val & _PAGE_PRESENT) { | ||
186 | unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; | ||
187 | pteval_t flags = val & ~PTE_MASK; | ||
188 | val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; | ||
189 | } | ||
190 | |||
191 | return val; | ||
192 | } | ||
193 | |||
194 | static pteval_t pte_pfn_to_mfn(pteval_t val) | ||
183 | { | 195 | { |
184 | pteval_t ret = pte.pte; | 196 | if (val & _PAGE_PRESENT) { |
197 | unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; | ||
198 | pteval_t flags = val & ~PTE_MASK; | ||
199 | val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; | ||
200 | } | ||
185 | 201 | ||
186 | if (ret & _PAGE_PRESENT) | 202 | return val; |
187 | ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; | 203 | } |
188 | 204 | ||
189 | return ret; | 205 | pteval_t xen_pte_val(pte_t pte) |
206 | { | ||
207 | return pte_mfn_to_pfn(pte.pte); | ||
190 | } | 208 | } |
191 | 209 | ||
192 | pgdval_t xen_pgd_val(pgd_t pgd) | 210 | pgdval_t xen_pgd_val(pgd_t pgd) |
193 | { | 211 | { |
194 | pgdval_t ret = pgd.pgd; | 212 | return pte_mfn_to_pfn(pgd.pgd); |
195 | if (ret & _PAGE_PRESENT) | ||
196 | ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; | ||
197 | return ret; | ||
198 | } | 213 | } |
199 | 214 | ||
200 | pte_t xen_make_pte(pteval_t pte) | 215 | pte_t xen_make_pte(pteval_t pte) |
201 | { | 216 | { |
202 | if (pte & _PAGE_PRESENT) { | 217 | pte = pte_pfn_to_mfn(pte); |
203 | pte = phys_to_machine(XPADDR(pte)).maddr; | 218 | return native_make_pte(pte); |
204 | pte &= ~(_PAGE_PCD | _PAGE_PWT); | ||
205 | } | ||
206 | |||
207 | return (pte_t){ .pte = pte }; | ||
208 | } | 219 | } |
209 | 220 | ||
210 | pgd_t xen_make_pgd(pgdval_t pgd) | 221 | pgd_t xen_make_pgd(pgdval_t pgd) |
211 | { | 222 | { |
212 | if (pgd & _PAGE_PRESENT) | 223 | pgd = pte_pfn_to_mfn(pgd); |
213 | pgd = phys_to_machine(XPADDR(pgd)).maddr; | 224 | return native_make_pgd(pgd); |
214 | |||
215 | return (pgd_t){ pgd }; | ||
216 | } | 225 | } |
217 | 226 | ||
218 | pmdval_t xen_pmd_val(pmd_t pmd) | 227 | pmdval_t xen_pmd_val(pmd_t pmd) |
219 | { | 228 | { |
220 | pmdval_t ret = native_pmd_val(pmd); | 229 | return pte_mfn_to_pfn(pmd.pmd); |
221 | if (ret & _PAGE_PRESENT) | ||
222 | ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; | ||
223 | return ret; | ||
224 | } | 230 | } |
225 | #ifdef CONFIG_X86_PAE | 231 | |
226 | void xen_set_pud(pud_t *ptr, pud_t val) | 232 | void xen_set_pud(pud_t *ptr, pud_t val) |
227 | { | 233 | { |
228 | struct multicall_space mcs; | 234 | struct multicall_space mcs; |
@@ -267,17 +273,9 @@ void xen_pmd_clear(pmd_t *pmdp) | |||
267 | 273 | ||
268 | pmd_t xen_make_pmd(pmdval_t pmd) | 274 | pmd_t xen_make_pmd(pmdval_t pmd) |
269 | { | 275 | { |
270 | if (pmd & _PAGE_PRESENT) | 276 | pmd = pte_pfn_to_mfn(pmd); |
271 | pmd = phys_to_machine(XPADDR(pmd)).maddr; | ||
272 | |||
273 | return native_make_pmd(pmd); | 277 | return native_make_pmd(pmd); |
274 | } | 278 | } |
275 | #else /* !PAE */ | ||
276 | void xen_set_pte(pte_t *ptep, pte_t pte) | ||
277 | { | ||
278 | *ptep = pte; | ||
279 | } | ||
280 | #endif /* CONFIG_X86_PAE */ | ||
281 | 279 | ||
282 | /* | 280 | /* |
283 | (Yet another) pagetable walker. This one is intended for pinning a | 281 | (Yet another) pagetable walker. This one is intended for pinning a |
@@ -430,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level) | |||
430 | read-only, and can be pinned. */ | 428 | read-only, and can be pinned. */ |
431 | void xen_pgd_pin(pgd_t *pgd) | 429 | void xen_pgd_pin(pgd_t *pgd) |
432 | { | 430 | { |
433 | unsigned level; | ||
434 | |||
435 | xen_mc_batch(); | 431 | xen_mc_batch(); |
436 | 432 | ||
437 | if (pgd_walk(pgd, pin_page, TASK_SIZE)) { | 433 | if (pgd_walk(pgd, pin_page, TASK_SIZE)) { |
@@ -441,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd) | |||
441 | xen_mc_batch(); | 437 | xen_mc_batch(); |
442 | } | 438 | } |
443 | 439 | ||
444 | #ifdef CONFIG_X86_PAE | 440 | xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); |
445 | level = MMUEXT_PIN_L3_TABLE; | ||
446 | #else | ||
447 | level = MMUEXT_PIN_L2_TABLE; | ||
448 | #endif | ||
449 | |||
450 | xen_do_pin(level, PFN_DOWN(__pa(pgd))); | ||
451 | |||
452 | xen_mc_issue(0); | 441 | xen_mc_issue(0); |
453 | } | 442 | } |
454 | 443 | ||
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index b5e189b1519d..5fe961caffd4 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h | |||
@@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm); | |||
37 | void xen_pgd_pin(pgd_t *pgd); | 37 | void xen_pgd_pin(pgd_t *pgd); |
38 | //void xen_pgd_unpin(pgd_t *pgd); | 38 | //void xen_pgd_unpin(pgd_t *pgd); |
39 | 39 | ||
40 | #ifdef CONFIG_X86_PAE | 40 | pteval_t xen_pte_val(pte_t); |
41 | unsigned long long xen_pte_val(pte_t); | 41 | pmdval_t xen_pmd_val(pmd_t); |
42 | unsigned long long xen_pmd_val(pmd_t); | 42 | pgdval_t xen_pgd_val(pgd_t); |
43 | unsigned long long xen_pgd_val(pgd_t); | ||
44 | 43 | ||
45 | pte_t xen_make_pte(unsigned long long); | 44 | pte_t xen_make_pte(pteval_t); |
46 | pmd_t xen_make_pmd(unsigned long long); | 45 | pmd_t xen_make_pmd(pmdval_t); |
47 | pgd_t xen_make_pgd(unsigned long long); | 46 | pgd_t xen_make_pgd(pgdval_t); |
48 | 47 | ||
49 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | 48 | void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, |
50 | pte_t *ptep, pte_t pteval); | 49 | pte_t *ptep, pte_t pteval); |
@@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val); | |||
53 | void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); | 52 | void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); |
54 | void xen_pmd_clear(pmd_t *pmdp); | 53 | void xen_pmd_clear(pmd_t *pmdp); |
55 | 54 | ||
56 | |||
57 | #else | ||
58 | unsigned long xen_pte_val(pte_t); | ||
59 | unsigned long xen_pmd_val(pmd_t); | ||
60 | unsigned long xen_pgd_val(pgd_t); | ||
61 | |||
62 | pte_t xen_make_pte(unsigned long); | ||
63 | pmd_t xen_make_pmd(unsigned long); | ||
64 | pgd_t xen_make_pgd(unsigned long); | ||
65 | #endif | ||
66 | |||
67 | #endif /* _XEN_MMU_H */ | 55 | #endif /* _XEN_MMU_H */ |
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 52b2e3856980..41e217503c96 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/kernel_stat.h> | 14 | #include <linux/kernel_stat.h> |
15 | #include <linux/math64.h> | 15 | #include <linux/math64.h> |
16 | 16 | ||
17 | #include <asm/pvclock.h> | ||
17 | #include <asm/xen/hypervisor.h> | 18 | #include <asm/xen/hypervisor.h> |
18 | #include <asm/xen/hypercall.h> | 19 | #include <asm/xen/hypercall.h> |
19 | 20 | ||
@@ -31,17 +32,6 @@ | |||
31 | 32 | ||
32 | static cycle_t xen_clocksource_read(void); | 33 | static cycle_t xen_clocksource_read(void); |
33 | 34 | ||
34 | /* These are perodically updated in shared_info, and then copied here. */ | ||
35 | struct shadow_time_info { | ||
36 | u64 tsc_timestamp; /* TSC at last update of time vals. */ | ||
37 | u64 system_timestamp; /* Time, in nanosecs, since boot. */ | ||
38 | u32 tsc_to_nsec_mul; | ||
39 | int tsc_shift; | ||
40 | u32 version; | ||
41 | }; | ||
42 | |||
43 | static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); | ||
44 | |||
45 | /* runstate info updated by Xen */ | 35 | /* runstate info updated by Xen */ |
46 | static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); | 36 | static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); |
47 | 37 | ||
@@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void) | |||
211 | unsigned long xen_cpu_khz(void) | 201 | unsigned long xen_cpu_khz(void) |
212 | { | 202 | { |
213 | u64 xen_khz = 1000000ULL << 32; | 203 | u64 xen_khz = 1000000ULL << 32; |
214 | const struct vcpu_time_info *info = | 204 | const struct pvclock_vcpu_time_info *info = |
215 | &HYPERVISOR_shared_info->vcpu_info[0].time; | 205 | &HYPERVISOR_shared_info->vcpu_info[0].time; |
216 | 206 | ||
217 | do_div(xen_khz, info->tsc_to_system_mul); | 207 | do_div(xen_khz, info->tsc_to_system_mul); |
@@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void) | |||
223 | return xen_khz; | 213 | return xen_khz; |
224 | } | 214 | } |
225 | 215 | ||
226 | /* | ||
227 | * Reads a consistent set of time-base values from Xen, into a shadow data | ||
228 | * area. | ||
229 | */ | ||
230 | static unsigned get_time_values_from_xen(void) | ||
231 | { | ||
232 | struct vcpu_time_info *src; | ||
233 | struct shadow_time_info *dst; | ||
234 | |||
235 | /* src is shared memory with the hypervisor, so we need to | ||
236 | make sure we get a consistent snapshot, even in the face of | ||
237 | being preempted. */ | ||
238 | src = &__get_cpu_var(xen_vcpu)->time; | ||
239 | dst = &__get_cpu_var(shadow_time); | ||
240 | |||
241 | do { | ||
242 | dst->version = src->version; | ||
243 | rmb(); /* fetch version before data */ | ||
244 | dst->tsc_timestamp = src->tsc_timestamp; | ||
245 | dst->system_timestamp = src->system_time; | ||
246 | dst->tsc_to_nsec_mul = src->tsc_to_system_mul; | ||
247 | dst->tsc_shift = src->tsc_shift; | ||
248 | rmb(); /* test version after fetching data */ | ||
249 | } while ((src->version & 1) | (dst->version ^ src->version)); | ||
250 | |||
251 | return dst->version; | ||
252 | } | ||
253 | |||
254 | /* | ||
255 | * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, | ||
256 | * yielding a 64-bit result. | ||
257 | */ | ||
258 | static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) | ||
259 | { | ||
260 | u64 product; | ||
261 | #ifdef __i386__ | ||
262 | u32 tmp1, tmp2; | ||
263 | #endif | ||
264 | |||
265 | if (shift < 0) | ||
266 | delta >>= -shift; | ||
267 | else | ||
268 | delta <<= shift; | ||
269 | |||
270 | #ifdef __i386__ | ||
271 | __asm__ ( | ||
272 | "mul %5 ; " | ||
273 | "mov %4,%%eax ; " | ||
274 | "mov %%edx,%4 ; " | ||
275 | "mul %5 ; " | ||
276 | "xor %5,%5 ; " | ||
277 | "add %4,%%eax ; " | ||
278 | "adc %5,%%edx ; " | ||
279 | : "=A" (product), "=r" (tmp1), "=r" (tmp2) | ||
280 | : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); | ||
281 | #elif __x86_64__ | ||
282 | __asm__ ( | ||
283 | "mul %%rdx ; shrd $32,%%rdx,%%rax" | ||
284 | : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); | ||
285 | #else | ||
286 | #error implement me! | ||
287 | #endif | ||
288 | |||
289 | return product; | ||
290 | } | ||
291 | |||
292 | static u64 get_nsec_offset(struct shadow_time_info *shadow) | ||
293 | { | ||
294 | u64 now, delta; | ||
295 | now = native_read_tsc(); | ||
296 | delta = now - shadow->tsc_timestamp; | ||
297 | return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); | ||
298 | } | ||
299 | |||
300 | static cycle_t xen_clocksource_read(void) | 216 | static cycle_t xen_clocksource_read(void) |
301 | { | 217 | { |
302 | struct shadow_time_info *shadow = &get_cpu_var(shadow_time); | 218 | struct pvclock_vcpu_time_info *src; |
303 | cycle_t ret; | 219 | cycle_t ret; |
304 | unsigned version; | ||
305 | |||
306 | do { | ||
307 | version = get_time_values_from_xen(); | ||
308 | barrier(); | ||
309 | ret = shadow->system_timestamp + get_nsec_offset(shadow); | ||
310 | barrier(); | ||
311 | } while (version != __get_cpu_var(xen_vcpu)->time.version); | ||
312 | |||
313 | put_cpu_var(shadow_time); | ||
314 | 220 | ||
221 | src = &get_cpu_var(xen_vcpu)->time; | ||
222 | ret = pvclock_clocksource_read(src); | ||
223 | put_cpu_var(xen_vcpu); | ||
315 | return ret; | 224 | return ret; |
316 | } | 225 | } |
317 | 226 | ||
318 | static void xen_read_wallclock(struct timespec *ts) | 227 | static void xen_read_wallclock(struct timespec *ts) |
319 | { | 228 | { |
320 | const struct shared_info *s = HYPERVISOR_shared_info; | 229 | struct shared_info *s = HYPERVISOR_shared_info; |
321 | u32 version; | 230 | struct pvclock_wall_clock *wall_clock = &(s->wc); |
322 | u64 delta; | 231 | struct pvclock_vcpu_time_info *vcpu_time; |
323 | struct timespec now; | ||
324 | |||
325 | /* get wallclock at system boot */ | ||
326 | do { | ||
327 | version = s->wc_version; | ||
328 | rmb(); /* fetch version before time */ | ||
329 | now.tv_sec = s->wc_sec; | ||
330 | now.tv_nsec = s->wc_nsec; | ||
331 | rmb(); /* fetch time before checking version */ | ||
332 | } while ((s->wc_version & 1) | (version ^ s->wc_version)); | ||
333 | 232 | ||
334 | delta = xen_clocksource_read(); /* time since system boot */ | 233 | vcpu_time = &get_cpu_var(xen_vcpu)->time; |
335 | delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; | 234 | pvclock_read_wallclock(wall_clock, vcpu_time, ts); |
336 | 235 | put_cpu_var(xen_vcpu); | |
337 | now.tv_nsec = do_div(delta, NSEC_PER_SEC); | ||
338 | now.tv_sec = delta; | ||
339 | |||
340 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); | ||
341 | } | 236 | } |
342 | 237 | ||
343 | unsigned long xen_get_wallclock(void) | 238 | unsigned long xen_get_wallclock(void) |
@@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void) | |||
345 | struct timespec ts; | 240 | struct timespec ts; |
346 | 241 | ||
347 | xen_read_wallclock(&ts); | 242 | xen_read_wallclock(&ts); |
348 | |||
349 | return ts.tv_sec; | 243 | return ts.tv_sec; |
350 | } | 244 | } |
351 | 245 | ||
@@ -569,8 +463,6 @@ __init void xen_time_init(void) | |||
569 | { | 463 | { |
570 | int cpu = smp_processor_id(); | 464 | int cpu = smp_processor_id(); |
571 | 465 | ||
572 | get_time_values_from_xen(); | ||
573 | |||
574 | clocksource_register(&xen_clocksource); | 466 | clocksource_register(&xen_clocksource); |
575 | 467 | ||
576 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { | 468 | if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { |
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 288d587ce73c..6ec3b4f7719b 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S | |||
@@ -17,7 +17,7 @@ ENTRY(startup_xen) | |||
17 | 17 | ||
18 | __FINIT | 18 | __FINIT |
19 | 19 | ||
20 | .pushsection .bss.page_aligned | 20 | .pushsection .text |
21 | .align PAGE_SIZE_asm | 21 | .align PAGE_SIZE_asm |
22 | ENTRY(hypercall_page) | 22 | ENTRY(hypercall_page) |
23 | .skip 0x1000 | 23 | .skip 0x1000 |
@@ -30,11 +30,7 @@ ENTRY(hypercall_page) | |||
30 | ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) | 30 | ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) |
31 | ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) | 31 | ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) |
32 | ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") | 32 | ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") |
33 | #ifdef CONFIG_X86_PAE | ||
34 | ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") | 33 | ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") |
35 | #else | ||
36 | ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") | ||
37 | #endif | ||
38 | ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") | 34 | ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") |
39 | 35 | ||
40 | #endif /*CONFIG_XEN */ | 36 | #endif /*CONFIG_XEN */ |
diff --git a/drivers/char/drm/i915_drv.c b/drivers/char/drm/i915_drv.c index e8f3d682e3b1..93aed1c38bd2 100644 --- a/drivers/char/drm/i915_drv.c +++ b/drivers/char/drm/i915_drv.c | |||
@@ -389,6 +389,7 @@ static int i915_resume(struct drm_device *dev) | |||
389 | pci_restore_state(dev->pdev); | 389 | pci_restore_state(dev->pdev); |
390 | if (pci_enable_device(dev->pdev)) | 390 | if (pci_enable_device(dev->pdev)) |
391 | return -1; | 391 | return -1; |
392 | pci_set_master(dev->pdev); | ||
392 | 393 | ||
393 | pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB); | 394 | pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB); |
394 | 395 | ||
diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c index b1a757a5ee27..8f81139d6194 100644 --- a/drivers/char/tty_ioctl.c +++ b/drivers/char/tty_ioctl.c | |||
@@ -981,16 +981,9 @@ EXPORT_SYMBOL_GPL(tty_perform_flush); | |||
981 | int n_tty_ioctl(struct tty_struct *tty, struct file *file, | 981 | int n_tty_ioctl(struct tty_struct *tty, struct file *file, |
982 | unsigned int cmd, unsigned long arg) | 982 | unsigned int cmd, unsigned long arg) |
983 | { | 983 | { |
984 | struct tty_struct *real_tty; | ||
985 | unsigned long flags; | 984 | unsigned long flags; |
986 | int retval; | 985 | int retval; |
987 | 986 | ||
988 | if (tty->driver->type == TTY_DRIVER_TYPE_PTY && | ||
989 | tty->driver->subtype == PTY_TYPE_MASTER) | ||
990 | real_tty = tty->link; | ||
991 | else | ||
992 | real_tty = tty; | ||
993 | |||
994 | switch (cmd) { | 987 | switch (cmd) { |
995 | case TCXONC: | 988 | case TCXONC: |
996 | retval = tty_check_change(tty); | 989 | retval = tty_check_change(tty); |
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index b224079d4e1f..d5862e5d99a0 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c | |||
@@ -109,7 +109,11 @@ static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_m | |||
109 | { | 109 | { |
110 | struct page *page; | 110 | struct page *page; |
111 | 111 | ||
112 | page = alloc_pages(gfp_mask, order); | 112 | /* |
113 | * Use __GFP_ZERO because buggy firmware assumes ICM pages are | ||
114 | * cleared, and subtle failures are seen if they aren't. | ||
115 | */ | ||
116 | page = alloc_pages(gfp_mask | __GFP_ZERO, order); | ||
113 | if (!page) | 117 | if (!page) |
114 | return -ENOMEM; | 118 | return -ENOMEM; |
115 | 119 | ||
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 5126d5d9ea0e..2e554a4ab337 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
@@ -176,7 +176,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) | |||
176 | * we set it now, so we can trap and pass that trap to the Guest if it | 176 | * we set it now, so we can trap and pass that trap to the Guest if it |
177 | * uses the FPU. */ | 177 | * uses the FPU. */ |
178 | if (cpu->ts) | 178 | if (cpu->ts) |
179 | lguest_set_ts(); | 179 | unlazy_fpu(current); |
180 | 180 | ||
181 | /* SYSENTER is an optimized way of doing system calls. We can't allow | 181 | /* SYSENTER is an optimized way of doing system calls. We can't allow |
182 | * it because it always jumps to privilege level 0. A normal Guest | 182 | * it because it always jumps to privilege level 0. A normal Guest |
@@ -196,6 +196,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) | |||
196 | * trap made the switcher code come back, and an error code which some | 196 | * trap made the switcher code come back, and an error code which some |
197 | * traps set. */ | 197 | * traps set. */ |
198 | 198 | ||
199 | /* Restore SYSENTER if it's supposed to be on. */ | ||
200 | if (boot_cpu_has(X86_FEATURE_SEP)) | ||
201 | wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); | ||
202 | |||
199 | /* If the Guest page faulted, then the cr2 register will tell us the | 203 | /* If the Guest page faulted, then the cr2 register will tell us the |
200 | * bad virtual address. We have to grab this now, because once we | 204 | * bad virtual address. We have to grab this now, because once we |
201 | * re-enable interrupts an interrupt could fault and thus overwrite | 205 | * re-enable interrupts an interrupt could fault and thus overwrite |
@@ -203,13 +207,12 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) | |||
203 | if (cpu->regs->trapnum == 14) | 207 | if (cpu->regs->trapnum == 14) |
204 | cpu->arch.last_pagefault = read_cr2(); | 208 | cpu->arch.last_pagefault = read_cr2(); |
205 | /* Similarly, if we took a trap because the Guest used the FPU, | 209 | /* Similarly, if we took a trap because the Guest used the FPU, |
206 | * we have to restore the FPU it expects to see. */ | 210 | * we have to restore the FPU it expects to see. |
211 | * math_state_restore() may sleep and we may even move off to | ||
212 | * a different CPU. So all the critical stuff should be done | ||
213 | * before this. */ | ||
207 | else if (cpu->regs->trapnum == 7) | 214 | else if (cpu->regs->trapnum == 7) |
208 | math_state_restore(); | 215 | math_state_restore(); |
209 | |||
210 | /* Restore SYSENTER if it's supposed to be on. */ | ||
211 | if (boot_cpu_has(X86_FEATURE_SEP)) | ||
212 | wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); | ||
213 | } | 216 | } |
214 | 217 | ||
215 | /*H:130 Now we've examined the hypercall code; our Guest can make requests. | 218 | /*H:130 Now we've examined the hypercall code; our Guest can make requests. |
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 8662a6b7a30b..25b352b664d9 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile | |||
@@ -68,7 +68,6 @@ obj-$(CONFIG_WAFER_WDT) += wafer5823wdt.o | |||
68 | obj-$(CONFIG_I6300ESB_WDT) += i6300esb.o | 68 | obj-$(CONFIG_I6300ESB_WDT) += i6300esb.o |
69 | obj-$(CONFIG_ITCO_WDT) += iTCO_wdt.o iTCO_vendor_support.o | 69 | obj-$(CONFIG_ITCO_WDT) += iTCO_wdt.o iTCO_vendor_support.o |
70 | obj-$(CONFIG_IT8712F_WDT) += it8712f_wdt.o | 70 | obj-$(CONFIG_IT8712F_WDT) += it8712f_wdt.o |
71 | CFLAGS_hpwdt.o += -O | ||
72 | obj-$(CONFIG_HP_WATCHDOG) += hpwdt.o | 71 | obj-$(CONFIG_HP_WATCHDOG) += hpwdt.o |
73 | obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o | 72 | obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o |
74 | obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o | 73 | obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o |
diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 4f0f22b020ea..76e5b7386af9 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c | |||
@@ -529,7 +529,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) | |||
529 | 529 | ||
530 | #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ | 530 | #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ |
531 | /* Clear master flag /before/ clearing selector flag. */ | 531 | /* Clear master flag /before/ clearing selector flag. */ |
532 | rmb(); | 532 | wmb(); |
533 | #endif | 533 | #endif |
534 | pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); | 534 | pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); |
535 | while (pending_words != 0) { | 535 | while (pending_words != 0) { |
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index c19184f2e70e..bec76b1c2bb0 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
@@ -246,15 +246,11 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block, | |||
246 | 246 | ||
247 | } | 247 | } |
248 | 248 | ||
249 | static inline unsigned int zero_metapath_length(const struct metapath *mp, | 249 | static inline unsigned int metapath_branch_start(const struct metapath *mp) |
250 | unsigned height) | ||
251 | { | 250 | { |
252 | unsigned int i; | 251 | if (mp->mp_list[0] == 0) |
253 | for (i = 0; i < height - 1; i++) { | 252 | return 2; |
254 | if (mp->mp_list[i] != 0) | 253 | return 1; |
255 | return i; | ||
256 | } | ||
257 | return height; | ||
258 | } | 254 | } |
259 | 255 | ||
260 | /** | 256 | /** |
@@ -436,7 +432,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
436 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 432 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
437 | struct buffer_head *dibh = mp->mp_bh[0]; | 433 | struct buffer_head *dibh = mp->mp_bh[0]; |
438 | u64 bn, dblock = 0; | 434 | u64 bn, dblock = 0; |
439 | unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0; | 435 | unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; |
440 | unsigned dblks = 0; | 436 | unsigned dblks = 0; |
441 | unsigned ptrs_per_blk; | 437 | unsigned ptrs_per_blk; |
442 | const unsigned end_of_metadata = height - 1; | 438 | const unsigned end_of_metadata = height - 1; |
@@ -471,9 +467,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
471 | /* Building up tree height */ | 467 | /* Building up tree height */ |
472 | state = ALLOC_GROW_HEIGHT; | 468 | state = ALLOC_GROW_HEIGHT; |
473 | iblks = height - ip->i_height; | 469 | iblks = height - ip->i_height; |
474 | zmpl = zero_metapath_length(mp, height); | 470 | branch_start = metapath_branch_start(mp); |
475 | iblks -= zmpl; | 471 | iblks += (height - branch_start); |
476 | iblks += height; | ||
477 | } | 472 | } |
478 | } | 473 | } |
479 | 474 | ||
@@ -509,13 +504,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
509 | sizeof(struct gfs2_meta_header)); | 504 | sizeof(struct gfs2_meta_header)); |
510 | *ptr = zero_bn; | 505 | *ptr = zero_bn; |
511 | state = ALLOC_GROW_DEPTH; | 506 | state = ALLOC_GROW_DEPTH; |
512 | for(i = zmpl; i < height; i++) { | 507 | for(i = branch_start; i < height; i++) { |
513 | if (mp->mp_bh[i] == NULL) | 508 | if (mp->mp_bh[i] == NULL) |
514 | break; | 509 | break; |
515 | brelse(mp->mp_bh[i]); | 510 | brelse(mp->mp_bh[i]); |
516 | mp->mp_bh[i] = NULL; | 511 | mp->mp_bh[i] = NULL; |
517 | } | 512 | } |
518 | i = zmpl; | 513 | i = branch_start; |
519 | } | 514 | } |
520 | if (n == 0) | 515 | if (n == 0) |
521 | break; | 516 | break; |
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6387523a3153..3401628d742b 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
@@ -195,7 +195,7 @@ ulong_aligned: | |||
195 | depending on architecture. I've experimented with several ways | 195 | depending on architecture. I've experimented with several ways |
196 | of writing this section such as using an else before the goto | 196 | of writing this section such as using an else before the goto |
197 | but this one seems to be the fastest. */ | 197 | but this one seems to be the fastest. */ |
198 | while ((unsigned char *)plong < end - 1) { | 198 | while ((unsigned char *)plong < end - sizeof(unsigned long)) { |
199 | prefetch(plong + 1); | 199 | prefetch(plong + 1); |
200 | if (((*plong) & LBITMASK) != lskipval) | 200 | if (((*plong) & LBITMASK) != lskipval) |
201 | break; | 201 | break; |
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 49c7cd0502cc..779d2eb649c5 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c | |||
@@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, | |||
130 | struct mnt_fhstatus *res) | 130 | struct mnt_fhstatus *res) |
131 | { | 131 | { |
132 | struct nfs_fh *fh = res->fh; | 132 | struct nfs_fh *fh = res->fh; |
133 | unsigned size; | ||
133 | 134 | ||
134 | if ((res->status = ntohl(*p++)) == 0) { | 135 | if ((res->status = ntohl(*p++)) == 0) { |
135 | int size = ntohl(*p++); | 136 | size = ntohl(*p++); |
136 | if (size <= NFS3_FHSIZE) { | 137 | if (size <= NFS3_FHSIZE && size != 0) { |
137 | fh->size = size; | 138 | fh->size = size; |
138 | memcpy(fh->data, p, size); | 139 | memcpy(fh->data, p, size); |
139 | } else | 140 | } else |
diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2a4a024a4e7b..614efeed5437 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c | |||
@@ -1216,8 +1216,6 @@ static int nfs_validate_mount_data(void *options, | |||
1216 | { | 1216 | { |
1217 | struct nfs_mount_data *data = (struct nfs_mount_data *)options; | 1217 | struct nfs_mount_data *data = (struct nfs_mount_data *)options; |
1218 | 1218 | ||
1219 | memset(args, 0, sizeof(*args)); | ||
1220 | |||
1221 | if (data == NULL) | 1219 | if (data == NULL) |
1222 | goto out_no_data; | 1220 | goto out_no_data; |
1223 | 1221 | ||
@@ -1251,13 +1249,13 @@ static int nfs_validate_mount_data(void *options, | |||
1251 | case 5: | 1249 | case 5: |
1252 | memset(data->context, 0, sizeof(data->context)); | 1250 | memset(data->context, 0, sizeof(data->context)); |
1253 | case 6: | 1251 | case 6: |
1254 | if (data->flags & NFS_MOUNT_VER3) | 1252 | if (data->flags & NFS_MOUNT_VER3) { |
1253 | if (data->root.size > NFS3_FHSIZE || data->root.size == 0) | ||
1254 | goto out_invalid_fh; | ||
1255 | mntfh->size = data->root.size; | 1255 | mntfh->size = data->root.size; |
1256 | else | 1256 | } else |
1257 | mntfh->size = NFS2_FHSIZE; | 1257 | mntfh->size = NFS2_FHSIZE; |
1258 | 1258 | ||
1259 | if (mntfh->size > sizeof(mntfh->data)) | ||
1260 | goto out_invalid_fh; | ||
1261 | 1259 | ||
1262 | memcpy(mntfh->data, data->root.data, mntfh->size); | 1260 | memcpy(mntfh->data, data->root.data, mntfh->size); |
1263 | if (mntfh->size < sizeof(mntfh->data)) | 1261 | if (mntfh->size < sizeof(mntfh->data)) |
@@ -1585,24 +1583,29 @@ static int nfs_get_sb(struct file_system_type *fs_type, | |||
1585 | { | 1583 | { |
1586 | struct nfs_server *server = NULL; | 1584 | struct nfs_server *server = NULL; |
1587 | struct super_block *s; | 1585 | struct super_block *s; |
1588 | struct nfs_fh mntfh; | 1586 | struct nfs_parsed_mount_data *data; |
1589 | struct nfs_parsed_mount_data data; | 1587 | struct nfs_fh *mntfh; |
1590 | struct dentry *mntroot; | 1588 | struct dentry *mntroot; |
1591 | int (*compare_super)(struct super_block *, void *) = nfs_compare_super; | 1589 | int (*compare_super)(struct super_block *, void *) = nfs_compare_super; |
1592 | struct nfs_sb_mountdata sb_mntdata = { | 1590 | struct nfs_sb_mountdata sb_mntdata = { |
1593 | .mntflags = flags, | 1591 | .mntflags = flags, |
1594 | }; | 1592 | }; |
1595 | int error; | 1593 | int error = -ENOMEM; |
1594 | |||
1595 | data = kzalloc(sizeof(*data), GFP_KERNEL); | ||
1596 | mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); | ||
1597 | if (data == NULL || mntfh == NULL) | ||
1598 | goto out_free_fh; | ||
1596 | 1599 | ||
1597 | security_init_mnt_opts(&data.lsm_opts); | 1600 | security_init_mnt_opts(&data->lsm_opts); |
1598 | 1601 | ||
1599 | /* Validate the mount data */ | 1602 | /* Validate the mount data */ |
1600 | error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); | 1603 | error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); |
1601 | if (error < 0) | 1604 | if (error < 0) |
1602 | goto out; | 1605 | goto out; |
1603 | 1606 | ||
1604 | /* Get a volume representation */ | 1607 | /* Get a volume representation */ |
1605 | server = nfs_create_server(&data, &mntfh); | 1608 | server = nfs_create_server(data, mntfh); |
1606 | if (IS_ERR(server)) { | 1609 | if (IS_ERR(server)) { |
1607 | error = PTR_ERR(server); | 1610 | error = PTR_ERR(server); |
1608 | goto out; | 1611 | goto out; |
@@ -1630,16 +1633,16 @@ static int nfs_get_sb(struct file_system_type *fs_type, | |||
1630 | 1633 | ||
1631 | if (!s->s_root) { | 1634 | if (!s->s_root) { |
1632 | /* initial superblock/root creation */ | 1635 | /* initial superblock/root creation */ |
1633 | nfs_fill_super(s, &data); | 1636 | nfs_fill_super(s, data); |
1634 | } | 1637 | } |
1635 | 1638 | ||
1636 | mntroot = nfs_get_root(s, &mntfh); | 1639 | mntroot = nfs_get_root(s, mntfh); |
1637 | if (IS_ERR(mntroot)) { | 1640 | if (IS_ERR(mntroot)) { |
1638 | error = PTR_ERR(mntroot); | 1641 | error = PTR_ERR(mntroot); |
1639 | goto error_splat_super; | 1642 | goto error_splat_super; |
1640 | } | 1643 | } |
1641 | 1644 | ||
1642 | error = security_sb_set_mnt_opts(s, &data.lsm_opts); | 1645 | error = security_sb_set_mnt_opts(s, &data->lsm_opts); |
1643 | if (error) | 1646 | if (error) |
1644 | goto error_splat_root; | 1647 | goto error_splat_root; |
1645 | 1648 | ||
@@ -1649,9 +1652,12 @@ static int nfs_get_sb(struct file_system_type *fs_type, | |||
1649 | error = 0; | 1652 | error = 0; |
1650 | 1653 | ||
1651 | out: | 1654 | out: |
1652 | kfree(data.nfs_server.hostname); | 1655 | kfree(data->nfs_server.hostname); |
1653 | kfree(data.mount_server.hostname); | 1656 | kfree(data->mount_server.hostname); |
1654 | security_free_mnt_opts(&data.lsm_opts); | 1657 | security_free_mnt_opts(&data->lsm_opts); |
1658 | out_free_fh: | ||
1659 | kfree(mntfh); | ||
1660 | kfree(data); | ||
1655 | return error; | 1661 | return error; |
1656 | 1662 | ||
1657 | out_err_nosb: | 1663 | out_err_nosb: |
@@ -1800,8 +1806,6 @@ static int nfs4_validate_mount_data(void *options, | |||
1800 | struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; | 1806 | struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; |
1801 | char *c; | 1807 | char *c; |
1802 | 1808 | ||
1803 | memset(args, 0, sizeof(*args)); | ||
1804 | |||
1805 | if (data == NULL) | 1809 | if (data == NULL) |
1806 | goto out_no_data; | 1810 | goto out_no_data; |
1807 | 1811 | ||
@@ -1959,26 +1963,31 @@ out_no_client_address: | |||
1959 | static int nfs4_get_sb(struct file_system_type *fs_type, | 1963 | static int nfs4_get_sb(struct file_system_type *fs_type, |
1960 | int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) | 1964 | int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) |
1961 | { | 1965 | { |
1962 | struct nfs_parsed_mount_data data; | 1966 | struct nfs_parsed_mount_data *data; |
1963 | struct super_block *s; | 1967 | struct super_block *s; |
1964 | struct nfs_server *server; | 1968 | struct nfs_server *server; |
1965 | struct nfs_fh mntfh; | 1969 | struct nfs_fh *mntfh; |
1966 | struct dentry *mntroot; | 1970 | struct dentry *mntroot; |
1967 | int (*compare_super)(struct super_block *, void *) = nfs_compare_super; | 1971 | int (*compare_super)(struct super_block *, void *) = nfs_compare_super; |
1968 | struct nfs_sb_mountdata sb_mntdata = { | 1972 | struct nfs_sb_mountdata sb_mntdata = { |
1969 | .mntflags = flags, | 1973 | .mntflags = flags, |
1970 | }; | 1974 | }; |
1971 | int error; | 1975 | int error = -ENOMEM; |
1972 | 1976 | ||
1973 | security_init_mnt_opts(&data.lsm_opts); | 1977 | data = kzalloc(sizeof(*data), GFP_KERNEL); |
1978 | mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); | ||
1979 | if (data == NULL || mntfh == NULL) | ||
1980 | goto out_free_fh; | ||
1981 | |||
1982 | security_init_mnt_opts(&data->lsm_opts); | ||
1974 | 1983 | ||
1975 | /* Validate the mount data */ | 1984 | /* Validate the mount data */ |
1976 | error = nfs4_validate_mount_data(raw_data, &data, dev_name); | 1985 | error = nfs4_validate_mount_data(raw_data, data, dev_name); |
1977 | if (error < 0) | 1986 | if (error < 0) |
1978 | goto out; | 1987 | goto out; |
1979 | 1988 | ||
1980 | /* Get a volume representation */ | 1989 | /* Get a volume representation */ |
1981 | server = nfs4_create_server(&data, &mntfh); | 1990 | server = nfs4_create_server(data, mntfh); |
1982 | if (IS_ERR(server)) { | 1991 | if (IS_ERR(server)) { |
1983 | error = PTR_ERR(server); | 1992 | error = PTR_ERR(server); |
1984 | goto out; | 1993 | goto out; |
@@ -2009,13 +2018,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, | |||
2009 | nfs4_fill_super(s); | 2018 | nfs4_fill_super(s); |
2010 | } | 2019 | } |
2011 | 2020 | ||
2012 | mntroot = nfs4_get_root(s, &mntfh); | 2021 | mntroot = nfs4_get_root(s, mntfh); |
2013 | if (IS_ERR(mntroot)) { | 2022 | if (IS_ERR(mntroot)) { |
2014 | error = PTR_ERR(mntroot); | 2023 | error = PTR_ERR(mntroot); |
2015 | goto error_splat_super; | 2024 | goto error_splat_super; |
2016 | } | 2025 | } |
2017 | 2026 | ||
2018 | error = security_sb_set_mnt_opts(s, &data.lsm_opts); | 2027 | error = security_sb_set_mnt_opts(s, &data->lsm_opts); |
2019 | if (error) | 2028 | if (error) |
2020 | goto error_splat_root; | 2029 | goto error_splat_root; |
2021 | 2030 | ||
@@ -2025,10 +2034,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, | |||
2025 | error = 0; | 2034 | error = 0; |
2026 | 2035 | ||
2027 | out: | 2036 | out: |
2028 | kfree(data.client_address); | 2037 | kfree(data->client_address); |
2029 | kfree(data.nfs_server.export_path); | 2038 | kfree(data->nfs_server.export_path); |
2030 | kfree(data.nfs_server.hostname); | 2039 | kfree(data->nfs_server.hostname); |
2031 | security_free_mnt_opts(&data.lsm_opts); | 2040 | security_free_mnt_opts(&data->lsm_opts); |
2041 | out_free_fh: | ||
2042 | kfree(mntfh); | ||
2043 | kfree(data); | ||
2032 | return error; | 2044 | return error; |
2033 | 2045 | ||
2034 | out_free: | 2046 | out_free: |
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6d8ace3e3259..f333848fd3be 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
@@ -739,12 +739,13 @@ int nfs_updatepage(struct file *file, struct page *page, | |||
739 | } | 739 | } |
740 | 740 | ||
741 | status = nfs_writepage_setup(ctx, page, offset, count); | 741 | status = nfs_writepage_setup(ctx, page, offset, count); |
742 | __set_page_dirty_nobuffers(page); | 742 | if (status < 0) |
743 | nfs_set_pageerror(page); | ||
744 | else | ||
745 | __set_page_dirty_nobuffers(page); | ||
743 | 746 | ||
744 | dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", | 747 | dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", |
745 | status, (long long)i_size_read(inode)); | 748 | status, (long long)i_size_read(inode)); |
746 | if (status < 0) | ||
747 | nfs_set_pageerror(page); | ||
748 | return status; | 749 | return status; |
749 | } | 750 | } |
750 | 751 | ||
diff --git a/fs/select.c b/fs/select.c index 8dda969614a9..da0e88201c3a 100644 --- a/fs/select.c +++ b/fs/select.c | |||
@@ -249,7 +249,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) | |||
249 | retval++; | 249 | retval++; |
250 | } | 250 | } |
251 | } | 251 | } |
252 | cond_resched(); | ||
253 | } | 252 | } |
254 | if (res_in) | 253 | if (res_in) |
255 | *rinp = res_in; | 254 | *rinp = res_in; |
@@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) | |||
257 | *routp = res_out; | 256 | *routp = res_out; |
258 | if (res_ex) | 257 | if (res_ex) |
259 | *rexp = res_ex; | 258 | *rexp = res_ex; |
259 | cond_resched(); | ||
260 | } | 260 | } |
261 | wait = NULL; | 261 | wait = NULL; |
262 | if (retval || !*timeout || signal_pending(current)) | 262 | if (retval || !*timeout || signal_pending(current)) |
diff --git a/include/asm-alpha/percpu.h b/include/asm-alpha/percpu.h index 82e8a94b4b2f..3495e8e00d70 100644 --- a/include/asm-alpha/percpu.h +++ b/include/asm-alpha/percpu.h | |||
@@ -69,6 +69,8 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; | |||
69 | #define __get_cpu_var(var) per_cpu_var(var) | 69 | #define __get_cpu_var(var) per_cpu_var(var) |
70 | #define __raw_get_cpu_var(var) per_cpu_var(var) | 70 | #define __raw_get_cpu_var(var) per_cpu_var(var) |
71 | 71 | ||
72 | #define PER_CPU_ATTRIBUTES | ||
73 | |||
72 | #endif /* SMP */ | 74 | #endif /* SMP */ |
73 | 75 | ||
74 | #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name) | 76 | #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name) |
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 1d8cd01fa514..844f2a89afbc 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/kvm_para.h> | 18 | #include <linux/kvm_para.h> |
19 | #include <linux/kvm_types.h> | 19 | #include <linux/kvm_types.h> |
20 | 20 | ||
21 | #include <asm/pvclock-abi.h> | ||
21 | #include <asm/desc.h> | 22 | #include <asm/desc.h> |
22 | 23 | ||
23 | #define KVM_MAX_VCPUS 16 | 24 | #define KVM_MAX_VCPUS 16 |
@@ -282,7 +283,8 @@ struct kvm_vcpu_arch { | |||
282 | struct x86_emulate_ctxt emulate_ctxt; | 283 | struct x86_emulate_ctxt emulate_ctxt; |
283 | 284 | ||
284 | gpa_t time; | 285 | gpa_t time; |
285 | struct kvm_vcpu_time_info hv_clock; | 286 | struct pvclock_vcpu_time_info hv_clock; |
287 | unsigned int hv_clock_tsc_khz; | ||
286 | unsigned int time_offset; | 288 | unsigned int time_offset; |
287 | struct page *time_page; | 289 | struct page *time_page; |
288 | }; | 290 | }; |
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h index 509845942070..bfd9900742bf 100644 --- a/include/asm-x86/kvm_para.h +++ b/include/asm-x86/kvm_para.h | |||
@@ -48,24 +48,6 @@ struct kvm_mmu_op_release_pt { | |||
48 | #ifdef __KERNEL__ | 48 | #ifdef __KERNEL__ |
49 | #include <asm/processor.h> | 49 | #include <asm/processor.h> |
50 | 50 | ||
51 | /* xen binary-compatible interface. See xen headers for details */ | ||
52 | struct kvm_vcpu_time_info { | ||
53 | uint32_t version; | ||
54 | uint32_t pad0; | ||
55 | uint64_t tsc_timestamp; | ||
56 | uint64_t system_time; | ||
57 | uint32_t tsc_to_system_mul; | ||
58 | int8_t tsc_shift; | ||
59 | int8_t pad[3]; | ||
60 | } __attribute__((__packed__)); /* 32 bytes */ | ||
61 | |||
62 | struct kvm_wall_clock { | ||
63 | uint32_t wc_version; | ||
64 | uint32_t wc_sec; | ||
65 | uint32_t wc_nsec; | ||
66 | } __attribute__((__packed__)); | ||
67 | |||
68 | |||
69 | extern void kvmclock_init(void); | 51 | extern void kvmclock_init(void); |
70 | 52 | ||
71 | 53 | ||
diff --git a/include/asm-x86/pvclock-abi.h b/include/asm-x86/pvclock-abi.h new file mode 100644 index 000000000000..6857f840b243 --- /dev/null +++ b/include/asm-x86/pvclock-abi.h | |||
@@ -0,0 +1,42 @@ | |||
1 | #ifndef _ASM_X86_PVCLOCK_ABI_H_ | ||
2 | #define _ASM_X86_PVCLOCK_ABI_H_ | ||
3 | #ifndef __ASSEMBLY__ | ||
4 | |||
5 | /* | ||
6 | * These structs MUST NOT be changed. | ||
7 | * They are the ABI between hypervisor and guest OS. | ||
8 | * Both Xen and KVM are using this. | ||
9 | * | ||
10 | * pvclock_vcpu_time_info holds the system time and the tsc timestamp | ||
11 | * of the last update. So the guest can use the tsc delta to get a | ||
12 | * more precise system time. There is one per virtual cpu. | ||
13 | * | ||
14 | * pvclock_wall_clock references the point in time when the system | ||
15 | * time was zero (usually boot time), thus the guest calculates the | ||
16 | * current wall clock by adding the system time. | ||
17 | * | ||
18 | * Protocol for the "version" fields is: hypervisor raises it (making | ||
19 | * it uneven) before it starts updating the fields and raises it again | ||
20 | * (making it even) when it is done. Thus the guest can make sure the | ||
21 | * time values it got are consistent by checking the version before | ||
22 | * and after reading them. | ||
23 | */ | ||
24 | |||
25 | struct pvclock_vcpu_time_info { | ||
26 | u32 version; | ||
27 | u32 pad0; | ||
28 | u64 tsc_timestamp; | ||
29 | u64 system_time; | ||
30 | u32 tsc_to_system_mul; | ||
31 | s8 tsc_shift; | ||
32 | u8 pad[3]; | ||
33 | } __attribute__((__packed__)); /* 32 bytes */ | ||
34 | |||
35 | struct pvclock_wall_clock { | ||
36 | u32 version; | ||
37 | u32 sec; | ||
38 | u32 nsec; | ||
39 | } __attribute__((__packed__)); | ||
40 | |||
41 | #endif /* __ASSEMBLY__ */ | ||
42 | #endif /* _ASM_X86_PVCLOCK_ABI_H_ */ | ||
diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h new file mode 100644 index 000000000000..85b1bba8e0a3 --- /dev/null +++ b/include/asm-x86/pvclock.h | |||
@@ -0,0 +1,13 @@ | |||
1 | #ifndef _ASM_X86_PVCLOCK_H_ | ||
2 | #define _ASM_X86_PVCLOCK_H_ | ||
3 | |||
4 | #include <linux/clocksource.h> | ||
5 | #include <asm/pvclock-abi.h> | ||
6 | |||
7 | /* some helper functions for xen and kvm pv clock sources */ | ||
8 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); | ||
9 | void pvclock_read_wallclock(struct pvclock_wall_clock *wall, | ||
10 | struct pvclock_vcpu_time_info *vcpu, | ||
11 | struct timespec *ts); | ||
12 | |||
13 | #endif /* _ASM_X86_PVCLOCK_H_ */ | ||
diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h index baf3a4dce28c..e11f24038b1d 100644 --- a/include/asm-x86/xen/page.h +++ b/include/asm-x86/xen/page.h | |||
@@ -150,13 +150,9 @@ static inline pte_t __pte_ma(pteval_t x) | |||
150 | return (pte_t) { .pte = x }; | 150 | return (pte_t) { .pte = x }; |
151 | } | 151 | } |
152 | 152 | ||
153 | #ifdef CONFIG_X86_PAE | ||
154 | #define pmd_val_ma(v) ((v).pmd) | 153 | #define pmd_val_ma(v) ((v).pmd) |
155 | #define pud_val_ma(v) ((v).pgd.pgd) | 154 | #define pud_val_ma(v) ((v).pgd.pgd) |
156 | #define __pmd_ma(x) ((pmd_t) { (x) } ) | 155 | #define __pmd_ma(x) ((pmd_t) { (x) } ) |
157 | #else /* !X86_PAE */ | ||
158 | #define pmd_val_ma(v) ((v).pud.pgd.pgd) | ||
159 | #endif /* CONFIG_X86_PAE */ | ||
160 | 156 | ||
161 | #define pgd_val_ma(x) ((x).pgd) | 157 | #define pgd_val_ma(x) ((x).pgd) |
162 | 158 | ||
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 092b1b25291d..de9d1df4bba2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
@@ -33,6 +33,7 @@ | |||
33 | #define KVM_REQ_REPORT_TPR_ACCESS 2 | 33 | #define KVM_REQ_REPORT_TPR_ACCESS 2 |
34 | #define KVM_REQ_MMU_RELOAD 3 | 34 | #define KVM_REQ_MMU_RELOAD 3 |
35 | #define KVM_REQ_TRIPLE_FAULT 4 | 35 | #define KVM_REQ_TRIPLE_FAULT 4 |
36 | #define KVM_REQ_PENDING_TIMER 5 | ||
36 | 37 | ||
37 | struct kvm_vcpu; | 38 | struct kvm_vcpu; |
38 | extern struct kmem_cache *kvm_vcpu_cache; | 39 | extern struct kmem_cache *kvm_vcpu_cache; |
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 59f1c0bd8f9c..d2a003586761 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h | |||
@@ -27,8 +27,7 @@ | |||
27 | * This routine is called by the kernel to write a series of | 27 | * This routine is called by the kernel to write a series of |
28 | * characters to the tty device. The characters may come from | 28 | * characters to the tty device. The characters may come from |
29 | * user space or kernel space. This routine will return the | 29 | * user space or kernel space. This routine will return the |
30 | * number of characters actually accepted for writing. This | 30 | * number of characters actually accepted for writing. |
31 | * routine is mandatory. | ||
32 | * | 31 | * |
33 | * Optional: Required for writable devices. | 32 | * Optional: Required for writable devices. |
34 | * | 33 | * |
@@ -134,7 +133,7 @@ | |||
134 | * This routine notifies the tty driver that it should hangup the | 133 | * This routine notifies the tty driver that it should hangup the |
135 | * tty device. | 134 | * tty device. |
136 | * | 135 | * |
137 | * Required: | 136 | * Optional: |
138 | * | 137 | * |
139 | * void (*break_ctl)(struct tty_stuct *tty, int state); | 138 | * void (*break_ctl)(struct tty_stuct *tty, int state); |
140 | * | 139 | * |
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 9b018da48cf3..819a0331cda9 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h | |||
@@ -10,6 +10,7 @@ | |||
10 | #define __XEN_PUBLIC_XEN_H__ | 10 | #define __XEN_PUBLIC_XEN_H__ |
11 | 11 | ||
12 | #include <asm/xen/interface.h> | 12 | #include <asm/xen/interface.h> |
13 | #include <asm/pvclock-abi.h> | ||
13 | 14 | ||
14 | /* | 15 | /* |
15 | * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). | 16 | * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). |
@@ -336,7 +337,7 @@ struct vcpu_info { | |||
336 | uint8_t evtchn_upcall_mask; | 337 | uint8_t evtchn_upcall_mask; |
337 | unsigned long evtchn_pending_sel; | 338 | unsigned long evtchn_pending_sel; |
338 | struct arch_vcpu_info arch; | 339 | struct arch_vcpu_info arch; |
339 | struct vcpu_time_info time; | 340 | struct pvclock_vcpu_time_info time; |
340 | }; /* 64 bytes (x86) */ | 341 | }; /* 64 bytes (x86) */ |
341 | 342 | ||
342 | /* | 343 | /* |
@@ -384,9 +385,7 @@ struct shared_info { | |||
384 | * Wallclock time: updated only by control software. Guests should base | 385 | * Wallclock time: updated only by control software. Guests should base |
385 | * their gettimeofday() syscall on this wallclock-base value. | 386 | * their gettimeofday() syscall on this wallclock-base value. |
386 | */ | 387 | */ |
387 | uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */ | 388 | struct pvclock_wall_clock wc; |
388 | uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ | ||
389 | uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */ | ||
390 | 389 | ||
391 | struct arch_shared_info arch; | 390 | struct arch_shared_info arch; |
392 | 391 | ||
diff --git a/kernel/futex.c b/kernel/futex.c index 449def8074fe..7d1136e97c14 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -1096,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q) | |||
1096 | * private futexes. | 1096 | * private futexes. |
1097 | */ | 1097 | */ |
1098 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | 1098 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, |
1099 | struct task_struct *newowner) | 1099 | struct task_struct *newowner, |
1100 | struct rw_semaphore *fshared) | ||
1100 | { | 1101 | { |
1101 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; | 1102 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; |
1102 | struct futex_pi_state *pi_state = q->pi_state; | 1103 | struct futex_pi_state *pi_state = q->pi_state; |
1104 | struct task_struct *oldowner = pi_state->owner; | ||
1103 | u32 uval, curval, newval; | 1105 | u32 uval, curval, newval; |
1104 | int ret; | 1106 | int ret, attempt = 0; |
1105 | 1107 | ||
1106 | /* Owner died? */ | 1108 | /* Owner died? */ |
1109 | if (!pi_state->owner) | ||
1110 | newtid |= FUTEX_OWNER_DIED; | ||
1111 | |||
1112 | /* | ||
1113 | * We are here either because we stole the rtmutex from the | ||
1114 | * pending owner or we are the pending owner which failed to | ||
1115 | * get the rtmutex. We have to replace the pending owner TID | ||
1116 | * in the user space variable. This must be atomic as we have | ||
1117 | * to preserve the owner died bit here. | ||
1118 | * | ||
1119 | * Note: We write the user space value _before_ changing the | ||
1120 | * pi_state because we can fault here. Imagine swapped out | ||
1121 | * pages or a fork, which was running right before we acquired | ||
1122 | * mmap_sem, that marked all the anonymous memory readonly for | ||
1123 | * cow. | ||
1124 | * | ||
1125 | * Modifying pi_state _before_ the user space value would | ||
1126 | * leave the pi_state in an inconsistent state when we fault | ||
1127 | * here, because we need to drop the hash bucket lock to | ||
1128 | * handle the fault. This might be observed in the PID check | ||
1129 | * in lookup_pi_state. | ||
1130 | */ | ||
1131 | retry: | ||
1132 | if (get_futex_value_locked(&uval, uaddr)) | ||
1133 | goto handle_fault; | ||
1134 | |||
1135 | while (1) { | ||
1136 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | ||
1137 | |||
1138 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | ||
1139 | |||
1140 | if (curval == -EFAULT) | ||
1141 | goto handle_fault; | ||
1142 | if (curval == uval) | ||
1143 | break; | ||
1144 | uval = curval; | ||
1145 | } | ||
1146 | |||
1147 | /* | ||
1148 | * We fixed up user space. Now we need to fix the pi_state | ||
1149 | * itself. | ||
1150 | */ | ||
1107 | if (pi_state->owner != NULL) { | 1151 | if (pi_state->owner != NULL) { |
1108 | spin_lock_irq(&pi_state->owner->pi_lock); | 1152 | spin_lock_irq(&pi_state->owner->pi_lock); |
1109 | WARN_ON(list_empty(&pi_state->list)); | 1153 | WARN_ON(list_empty(&pi_state->list)); |
1110 | list_del_init(&pi_state->list); | 1154 | list_del_init(&pi_state->list); |
1111 | spin_unlock_irq(&pi_state->owner->pi_lock); | 1155 | spin_unlock_irq(&pi_state->owner->pi_lock); |
1112 | } else | 1156 | } |
1113 | newtid |= FUTEX_OWNER_DIED; | ||
1114 | 1157 | ||
1115 | pi_state->owner = newowner; | 1158 | pi_state->owner = newowner; |
1116 | 1159 | ||
@@ -1118,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
1118 | WARN_ON(!list_empty(&pi_state->list)); | 1161 | WARN_ON(!list_empty(&pi_state->list)); |
1119 | list_add(&pi_state->list, &newowner->pi_state_list); | 1162 | list_add(&pi_state->list, &newowner->pi_state_list); |
1120 | spin_unlock_irq(&newowner->pi_lock); | 1163 | spin_unlock_irq(&newowner->pi_lock); |
1164 | return 0; | ||
1121 | 1165 | ||
1122 | /* | 1166 | /* |
1123 | * We own it, so we have to replace the pending owner | 1167 | * To handle the page fault we need to drop the hash bucket |
1124 | * TID. This must be atomic as we have preserve the | 1168 | * lock here. That gives the other task (either the pending |
1125 | * owner died bit here. | 1169 | * owner itself or the task which stole the rtmutex) the |
1170 | * chance to try the fixup of the pi_state. So once we are | ||
1171 | * back from handling the fault we need to check the pi_state | ||
1172 | * after reacquiring the hash bucket lock and before trying to | ||
1173 | * do another fixup. When the fixup has been done already we | ||
1174 | * simply return. | ||
1126 | */ | 1175 | */ |
1127 | ret = get_futex_value_locked(&uval, uaddr); | 1176 | handle_fault: |
1177 | spin_unlock(q->lock_ptr); | ||
1128 | 1178 | ||
1129 | while (!ret) { | 1179 | ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); |
1130 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | ||
1131 | 1180 | ||
1132 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | 1181 | spin_lock(q->lock_ptr); |
1133 | 1182 | ||
1134 | if (curval == -EFAULT) | 1183 | /* |
1135 | ret = -EFAULT; | 1184 | * Check if someone else fixed it for us: |
1136 | if (curval == uval) | 1185 | */ |
1137 | break; | 1186 | if (pi_state->owner != oldowner) |
1138 | uval = curval; | 1187 | return 0; |
1139 | } | 1188 | |
1140 | return ret; | 1189 | if (ret) |
1190 | return ret; | ||
1191 | |||
1192 | goto retry; | ||
1141 | } | 1193 | } |
1142 | 1194 | ||
1143 | /* | 1195 | /* |
@@ -1507,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1507 | * that case: | 1559 | * that case: |
1508 | */ | 1560 | */ |
1509 | if (q.pi_state->owner != curr) | 1561 | if (q.pi_state->owner != curr) |
1510 | ret = fixup_pi_state_owner(uaddr, &q, curr); | 1562 | ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); |
1511 | } else { | 1563 | } else { |
1512 | /* | 1564 | /* |
1513 | * Catch the rare case, where the lock was released | 1565 | * Catch the rare case, where the lock was released |
@@ -1539,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1539 | int res; | 1591 | int res; |
1540 | 1592 | ||
1541 | owner = rt_mutex_owner(&q.pi_state->pi_mutex); | 1593 | owner = rt_mutex_owner(&q.pi_state->pi_mutex); |
1542 | res = fixup_pi_state_owner(uaddr, &q, owner); | 1594 | res = fixup_pi_state_owner(uaddr, &q, owner, |
1595 | fshared); | ||
1543 | 1596 | ||
1544 | /* propagate -EFAULT, if the fixup failed */ | 1597 | /* propagate -EFAULT, if the fixup failed */ |
1545 | if (res) | 1598 | if (res) |
diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 79e3c90113c2..3ec23c3ec97f 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c | |||
@@ -1499,7 +1499,8 @@ int kgdb_nmicallback(int cpu, void *regs) | |||
1499 | return 1; | 1499 | return 1; |
1500 | } | 1500 | } |
1501 | 1501 | ||
1502 | void kgdb_console_write(struct console *co, const char *s, unsigned count) | 1502 | static void kgdb_console_write(struct console *co, const char *s, |
1503 | unsigned count) | ||
1503 | { | 1504 | { |
1504 | unsigned long flags; | 1505 | unsigned long flags; |
1505 | 1506 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 8d7c246ab864..7c0e7063c11f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -4398,22 +4398,20 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
4398 | signal_pending(current)) || | 4398 | signal_pending(current)) || |
4399 | (state == TASK_KILLABLE && | 4399 | (state == TASK_KILLABLE && |
4400 | fatal_signal_pending(current))) { | 4400 | fatal_signal_pending(current))) { |
4401 | __remove_wait_queue(&x->wait, &wait); | 4401 | timeout = -ERESTARTSYS; |
4402 | return -ERESTARTSYS; | 4402 | break; |
4403 | } | 4403 | } |
4404 | __set_current_state(state); | 4404 | __set_current_state(state); |
4405 | spin_unlock_irq(&x->wait.lock); | 4405 | spin_unlock_irq(&x->wait.lock); |
4406 | timeout = schedule_timeout(timeout); | 4406 | timeout = schedule_timeout(timeout); |
4407 | spin_lock_irq(&x->wait.lock); | 4407 | spin_lock_irq(&x->wait.lock); |
4408 | if (!timeout) { | 4408 | } while (!x->done && timeout); |
4409 | __remove_wait_queue(&x->wait, &wait); | ||
4410 | return timeout; | ||
4411 | } | ||
4412 | } while (!x->done); | ||
4413 | __remove_wait_queue(&x->wait, &wait); | 4409 | __remove_wait_queue(&x->wait, &wait); |
4410 | if (!x->done) | ||
4411 | return timeout; | ||
4414 | } | 4412 | } |
4415 | x->done--; | 4413 | x->done--; |
4416 | return timeout; | 4414 | return timeout ?: 1; |
4417 | } | 4415 | } |
4418 | 4416 | ||
4419 | static long __sched | 4417 | static long __sched |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1dad5bbb59b6..0f3c19197fa4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -250,7 +250,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
250 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | 250 | if (rt_rq->rt_time || rt_rq->rt_nr_running) |
251 | idle = 0; | 251 | idle = 0; |
252 | spin_unlock(&rt_rq->rt_runtime_lock); | 252 | spin_unlock(&rt_rq->rt_runtime_lock); |
253 | } | 253 | } else if (rt_rq->rt_nr_running) |
254 | idle = 0; | ||
254 | 255 | ||
255 | if (enqueue) | 256 | if (enqueue) |
256 | sched_rt_rq_enqueue(rt_rq); | 257 | sched_rt_rq_enqueue(rt_rq); |
diff --git a/mm/memory.c b/mm/memory.c index 9aefaae46858..d14b251a25a6 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1045,6 +1045,26 @@ no_page_table: | |||
1045 | return page; | 1045 | return page; |
1046 | } | 1046 | } |
1047 | 1047 | ||
1048 | /* Can we do the FOLL_ANON optimization? */ | ||
1049 | static inline int use_zero_page(struct vm_area_struct *vma) | ||
1050 | { | ||
1051 | /* | ||
1052 | * We don't want to optimize FOLL_ANON for make_pages_present() | ||
1053 | * when it tries to page in a VM_LOCKED region. As to VM_SHARED, | ||
1054 | * we want to get the page from the page tables to make sure | ||
1055 | * that we serialize and update with any other user of that | ||
1056 | * mapping. | ||
1057 | */ | ||
1058 | if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) | ||
1059 | return 0; | ||
1060 | /* | ||
1061 | * And if we have a fault or a nopfn routine, it's not an | ||
1062 | * anonymous region. | ||
1063 | */ | ||
1064 | return !vma->vm_ops || | ||
1065 | (!vma->vm_ops->fault && !vma->vm_ops->nopfn); | ||
1066 | } | ||
1067 | |||
1048 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1068 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1049 | unsigned long start, int len, int write, int force, | 1069 | unsigned long start, int len, int write, int force, |
1050 | struct page **pages, struct vm_area_struct **vmas) | 1070 | struct page **pages, struct vm_area_struct **vmas) |
@@ -1119,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1119 | foll_flags = FOLL_TOUCH; | 1139 | foll_flags = FOLL_TOUCH; |
1120 | if (pages) | 1140 | if (pages) |
1121 | foll_flags |= FOLL_GET; | 1141 | foll_flags |= FOLL_GET; |
1122 | if (!write && !(vma->vm_flags & VM_LOCKED) && | 1142 | if (!write && use_zero_page(vma)) |
1123 | (!vma->vm_ops || !vma->vm_ops->fault)) | ||
1124 | foll_flags |= FOLL_ANON; | 1143 | foll_flags |= FOLL_ANON; |
1125 | 1144 | ||
1126 | do { | 1145 | do { |
@@ -1766,7 +1785,6 @@ gotten: | |||
1766 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 1785 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
1767 | if (likely(pte_same(*page_table, orig_pte))) { | 1786 | if (likely(pte_same(*page_table, orig_pte))) { |
1768 | if (old_page) { | 1787 | if (old_page) { |
1769 | page_remove_rmap(old_page, vma); | ||
1770 | if (!PageAnon(old_page)) { | 1788 | if (!PageAnon(old_page)) { |
1771 | dec_mm_counter(mm, file_rss); | 1789 | dec_mm_counter(mm, file_rss); |
1772 | inc_mm_counter(mm, anon_rss); | 1790 | inc_mm_counter(mm, anon_rss); |
@@ -1788,6 +1806,32 @@ gotten: | |||
1788 | lru_cache_add_active(new_page); | 1806 | lru_cache_add_active(new_page); |
1789 | page_add_new_anon_rmap(new_page, vma, address); | 1807 | page_add_new_anon_rmap(new_page, vma, address); |
1790 | 1808 | ||
1809 | if (old_page) { | ||
1810 | /* | ||
1811 | * Only after switching the pte to the new page may | ||
1812 | * we remove the mapcount here. Otherwise another | ||
1813 | * process may come and find the rmap count decremented | ||
1814 | * before the pte is switched to the new page, and | ||
1815 | * "reuse" the old page writing into it while our pte | ||
1816 | * here still points into it and can be read by other | ||
1817 | * threads. | ||
1818 | * | ||
1819 | * The critical issue is to order this | ||
1820 | * page_remove_rmap with the ptp_clear_flush above. | ||
1821 | * Those stores are ordered by (if nothing else,) | ||
1822 | * the barrier present in the atomic_add_negative | ||
1823 | * in page_remove_rmap. | ||
1824 | * | ||
1825 | * Then the TLB flush in ptep_clear_flush ensures that | ||
1826 | * no process can access the old page before the | ||
1827 | * decremented mapcount is visible. And the old page | ||
1828 | * cannot be reused until after the decremented | ||
1829 | * mapcount is visible. So transitively, TLBs to | ||
1830 | * old page will be flushed before it can be reused. | ||
1831 | */ | ||
1832 | page_remove_rmap(old_page, vma); | ||
1833 | } | ||
1834 | |||
1791 | /* Free the old page.. */ | 1835 | /* Free the old page.. */ |
1792 | new_page = old_page; | 1836 | new_page = old_page; |
1793 | ret |= VM_FAULT_WRITE; | 1837 | ret |= VM_FAULT_WRITE; |
diff --git a/sound/isa/sb/sb_mixer.c b/sound/isa/sb/sb_mixer.c index 91d14224f6b3..73d4572d136b 100644 --- a/sound/isa/sb/sb_mixer.c +++ b/sound/isa/sb/sb_mixer.c | |||
@@ -925,7 +925,7 @@ static unsigned char als4000_saved_regs[] = { | |||
925 | static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) | 925 | static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) |
926 | { | 926 | { |
927 | unsigned char *val = chip->saved_regs; | 927 | unsigned char *val = chip->saved_regs; |
928 | snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return); | 928 | snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return); |
929 | for (; num_regs; num_regs--) | 929 | for (; num_regs; num_regs--) |
930 | *val++ = snd_sbmixer_read(chip, *regs++); | 930 | *val++ = snd_sbmixer_read(chip, *regs++); |
931 | } | 931 | } |
@@ -933,7 +933,7 @@ static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) | |||
933 | static void restore_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) | 933 | static void restore_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) |
934 | { | 934 | { |
935 | unsigned char *val = chip->saved_regs; | 935 | unsigned char *val = chip->saved_regs; |
936 | snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return); | 936 | snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return); |
937 | for (; num_regs; num_regs--) | 937 | for (; num_regs; num_regs--) |
938 | snd_sbmixer_write(chip, *regs++, *val++); | 938 | snd_sbmixer_write(chip, *regs++, *val++); |
939 | } | 939 | } |
diff --git a/sound/pci/aw2/aw2-alsa.c b/sound/pci/aw2/aw2-alsa.c index 56f87cd33c19..3f00ddf450f8 100644 --- a/sound/pci/aw2/aw2-alsa.c +++ b/sound/pci/aw2/aw2-alsa.c | |||
@@ -316,6 +316,8 @@ static int __devinit snd_aw2_create(struct snd_card *card, | |||
316 | return -ENOMEM; | 316 | return -ENOMEM; |
317 | } | 317 | } |
318 | 318 | ||
319 | /* (2) initialization of the chip hardware */ | ||
320 | snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt); | ||
319 | 321 | ||
320 | if (request_irq(pci->irq, snd_aw2_saa7146_interrupt, | 322 | if (request_irq(pci->irq, snd_aw2_saa7146_interrupt, |
321 | IRQF_SHARED, "Audiowerk2", chip)) { | 323 | IRQF_SHARED, "Audiowerk2", chip)) { |
@@ -329,8 +331,6 @@ static int __devinit snd_aw2_create(struct snd_card *card, | |||
329 | } | 331 | } |
330 | chip->irq = pci->irq; | 332 | chip->irq = pci->irq; |
331 | 333 | ||
332 | /* (2) initialization of the chip hardware */ | ||
333 | snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt); | ||
334 | err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops); | 334 | err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops); |
335 | if (err < 0) { | 335 | if (err < 0) { |
336 | free_irq(chip->irq, (void *)chip); | 336 | free_irq(chip->irq, (void *)chip); |
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 98778cb69c6e..1dcf9f3d1107 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c | |||
@@ -269,28 +269,9 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) | |||
269 | } | 269 | } |
270 | } | 270 | } |
271 | 271 | ||
272 | static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) | 272 | static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi) |
273 | { | 273 | { |
274 | int i; | ||
275 | |||
276 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | ||
277 | if (ioapic->redirtbl[i].fields.vector == vector) | ||
278 | return i; | ||
279 | return -1; | ||
280 | } | ||
281 | |||
282 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) | ||
283 | { | ||
284 | struct kvm_ioapic *ioapic = kvm->arch.vioapic; | ||
285 | union ioapic_redir_entry *ent; | 274 | union ioapic_redir_entry *ent; |
286 | int gsi; | ||
287 | |||
288 | gsi = get_eoi_gsi(ioapic, vector); | ||
289 | if (gsi == -1) { | ||
290 | printk(KERN_WARNING "Can't find redir item for %d EOI\n", | ||
291 | vector); | ||
292 | return; | ||
293 | } | ||
294 | 275 | ||
295 | ent = &ioapic->redirtbl[gsi]; | 276 | ent = &ioapic->redirtbl[gsi]; |
296 | ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); | 277 | ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); |
@@ -300,6 +281,16 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) | |||
300 | ioapic_deliver(ioapic, gsi); | 281 | ioapic_deliver(ioapic, gsi); |
301 | } | 282 | } |
302 | 283 | ||
284 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) | ||
285 | { | ||
286 | struct kvm_ioapic *ioapic = kvm->arch.vioapic; | ||
287 | int i; | ||
288 | |||
289 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | ||
290 | if (ioapic->redirtbl[i].fields.vector == vector) | ||
291 | __kvm_ioapic_update_eoi(ioapic, i); | ||
292 | } | ||
293 | |||
303 | static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr) | 294 | static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr) |
304 | { | 295 | { |
305 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | 296 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; |