aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2009-08-14 01:41:02 -0400
committerTejun Heo <tj@kernel.org>2009-08-14 01:45:31 -0400
commit384be2b18a5f9475eab9ca2bdfa95cc1a04ef59c (patch)
tree04c93f391a1b65c8bf8d7ba8643c07d26c26590a /arch/x86
parenta76761b621bcd8336065c4fe3a74f046858bc34c (diff)
parent142d44b0dd6741a64a7bdbe029110e7c1dcf1d23 (diff)
Merge branch 'percpu-for-linus' into percpu-for-next
Conflicts: arch/sparc/kernel/smp_64.c arch/x86/kernel/cpu/perf_counter.c arch/x86/kernel/setup_percpu.c drivers/cpufreq/cpufreq_ondemand.c mm/percpu.c Conflicts in core and arch percpu codes are mostly from commit ed78e1e078dd44249f88b1dd8c76dafb39567161 which substituted many num_possible_cpus() with nr_cpu_ids. As for-next branch has moved all the first chunk allocators into mm/percpu.c, the changes are moved from arch code to mm/percpu.c. Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig14
-rw-r--r--arch/x86/boot/video-bios.c3
-rw-r--r--arch/x86/boot/video-vesa.c4
-rw-r--r--arch/x86/include/asm/atomic_32.h185
-rw-r--r--arch/x86/include/asm/atomic_64.h42
-rw-r--r--arch/x86/include/asm/efi.h5
-rw-r--r--arch/x86/include/asm/fixmap.h10
-rw-r--r--arch/x86/include/asm/io_apic.h2
-rw-r--r--arch/x86/include/asm/irqflags.h8
-rw-r--r--arch/x86/include/asm/lguest.h3
-rw-r--r--arch/x86/include/asm/lguest_hcall.h18
-rw-r--r--arch/x86/include/asm/msr-index.h4
-rw-r--r--arch/x86/include/asm/nmi.h1
-rw-r--r--arch/x86/include/asm/pgalloc.h25
-rw-r--r--arch/x86/include/asm/spinlock.h4
-rw-r--r--arch/x86/include/asm/stacktrace.h2
-rw-r--r--arch/x86/include/asm/thread_info.h2
-rw-r--r--arch/x86/include/asm/uaccess.h4
-rw-r--r--arch/x86/include/asm/uaccess_64.h10
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h9
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/amd_iommu.c4
-rw-r--r--arch/x86/kernel/amd_iommu_init.c13
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/es7000_32.c3
-rw-r--r--arch/x86/kernel/apic/io_apic.c47
-rw-r--r--arch/x86/kernel/apic/numaq_32.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c38
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/common.c48
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c32
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c18
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c18
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c289
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c5
-rw-r--r--arch/x86/kernel/dumpstack_32.c6
-rw-r--r--arch/x86/kernel/dumpstack_64.c22
-rw-r--r--arch/x86/kernel/e820.c7
-rw-r--r--arch/x86/kernel/efi.c4
-rw-r--r--arch/x86/kernel/efi_64.c6
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/mfgpt_32.c2
-rw-r--r--arch/x86/kernel/pci-gart_64.c2
-rw-r--r--arch/x86/kernel/pvclock.c2
-rw-r--r--arch/x86/kernel/reboot.c50
-rw-r--r--arch/x86/kernel/setup.c13
-rw-r--r--arch/x86/kernel/setup_percpu.c2
-rw-r--r--arch/x86/kernel/tsc.c29
-rw-r--r--arch/x86/kernel/vmi_32.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S23
-rw-r--r--arch/x86/kvm/i8254.c3
-rw-r--r--arch/x86/kvm/mmu.c48
-rw-r--r--arch/x86/kvm/svm.c6
-rw-r--r--arch/x86/kvm/vmx.c6
-rw-r--r--arch/x86/kvm/x86.c44
-rw-r--r--arch/x86/lguest/boot.c510
-rw-r--r--arch/x86/lguest/i386_head.S112
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_32.c230
-rw-r--r--arch/x86/lib/clear_page_64.S5
-rw-r--r--arch/x86/lib/copy_user_64.S1
-rw-r--r--arch/x86/lib/msr.c26
-rw-r--r--arch/x86/lib/usercopy_32.c2
-rw-r--r--arch/x86/mm/fault.c11
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--arch/x86/mm/init.c1
-rw-r--r--arch/x86/mm/init_64.c11
-rw-r--r--arch/x86/mm/pageattr.c39
-rw-r--r--arch/x86/mm/pgtable.c7
-rw-r--r--arch/x86/mm/srat_64.c6
-rw-r--r--arch/x86/oprofile/nmi_int.c2
-rw-r--r--arch/x86/pci/acpi.c59
-rw-r--r--arch/x86/pci/amd_bus.c8
-rw-r--r--arch/x86/pci/i386.c7
-rw-r--r--arch/x86/power/Makefile2
81 files changed, 1540 insertions, 699 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 646fcabb0ad7..e06b2eeff9f2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,6 +24,7 @@ config X86
24 select HAVE_UNSTABLE_SCHED_CLOCK 24 select HAVE_UNSTABLE_SCHED_CLOCK
25 select HAVE_IDE 25 select HAVE_IDE
26 select HAVE_OPROFILE 26 select HAVE_OPROFILE
27 select HAVE_PERF_COUNTERS if (!M386 && !M486)
27 select HAVE_IOREMAP_PROT 28 select HAVE_IOREMAP_PROT
28 select HAVE_KPROBES 29 select HAVE_KPROBES
29 select ARCH_WANT_OPTIONAL_GPIOLIB 30 select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -739,7 +740,6 @@ config X86_UP_IOAPIC
739config X86_LOCAL_APIC 740config X86_LOCAL_APIC
740 def_bool y 741 def_bool y
741 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 742 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
742 select HAVE_PERF_COUNTERS if (!M386 && !M486)
743 743
744config X86_IO_APIC 744config X86_IO_APIC
745 def_bool y 745 def_bool y
@@ -1910,6 +1910,18 @@ config DMAR_DEFAULT_ON
1910 recommended you say N here while the DMAR code remains 1910 recommended you say N here while the DMAR code remains
1911 experimental. 1911 experimental.
1912 1912
1913config DMAR_BROKEN_GFX_WA
1914 def_bool n
1915 prompt "Workaround broken graphics drivers (going away soon)"
1916 depends on DMAR
1917 ---help---
1918 Current Graphics drivers tend to use physical address
1919 for DMA and avoid using DMA APIs. Setting this config
1920 option permits the IOMMU driver to set a unity map for
1921 all the OS-visible memory. Hence the driver can continue
1922 to use physical addresses for DMA, at least until this
1923 option is removed in the 2.6.32 kernel.
1924
1913config DMAR_FLOPPY_WA 1925config DMAR_FLOPPY_WA
1914 def_bool y 1926 def_bool y
1915 depends on DMAR 1927 depends on DMAR
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index d660be492363..49e0c18833e0 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -37,14 +37,13 @@ static int set_bios_mode(u8 mode)
37 ireg.al = mode; /* AH=0x00 Set Video Mode */ 37 ireg.al = mode; /* AH=0x00 Set Video Mode */
38 intcall(0x10, &ireg, NULL); 38 intcall(0x10, &ireg, NULL);
39 39
40
41 ireg.ah = 0x0f; /* Get Current Video Mode */ 40 ireg.ah = 0x0f; /* Get Current Video Mode */
42 intcall(0x10, &ireg, &oreg); 41 intcall(0x10, &ireg, &oreg);
43 42
44 do_restore = 1; /* Assume video contents were lost */ 43 do_restore = 1; /* Assume video contents were lost */
45 44
46 /* Not all BIOSes are clean with the top bit */ 45 /* Not all BIOSes are clean with the top bit */
47 new_mode = ireg.al & 0x7f; 46 new_mode = oreg.al & 0x7f;
48 47
49 if (new_mode == mode) 48 if (new_mode == mode)
50 return 0; /* Mode change OK */ 49 return 0; /* Mode change OK */
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index c700147d6ffb..275dd177f198 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -45,7 +45,7 @@ static int vesa_probe(void)
45 ireg.di = (size_t)&vginfo; 45 ireg.di = (size_t)&vginfo;
46 intcall(0x10, &ireg, &oreg); 46 intcall(0x10, &ireg, &oreg);
47 47
48 if (ireg.ax != 0x004f || 48 if (oreg.ax != 0x004f ||
49 vginfo.signature != VESA_MAGIC || 49 vginfo.signature != VESA_MAGIC ||
50 vginfo.version < 0x0102) 50 vginfo.version < 0x0102)
51 return 0; /* Not present */ 51 return 0; /* Not present */
@@ -70,7 +70,7 @@ static int vesa_probe(void)
70 ireg.di = (size_t)&vminfo; 70 ireg.di = (size_t)&vminfo;
71 intcall(0x10, &ireg, &oreg); 71 intcall(0x10, &ireg, &oreg);
72 72
73 if (ireg.ax != 0x004f) 73 if (oreg.ax != 0x004f)
74 continue; 74 continue;
75 75
76 if ((vminfo.mode_attr & 0x15) == 0x05) { 76 if ((vminfo.mode_attr & 0x15) == 0x05) {
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 2503d4e64c2a..dc5a667ff791 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -19,7 +19,10 @@
19 * 19 *
20 * Atomically reads the value of @v. 20 * Atomically reads the value of @v.
21 */ 21 */
22#define atomic_read(v) ((v)->counter) 22static inline int atomic_read(const atomic_t *v)
23{
24 return v->counter;
25}
23 26
24/** 27/**
25 * atomic_set - set atomic variable 28 * atomic_set - set atomic variable
@@ -28,7 +31,10 @@
28 * 31 *
29 * Atomically sets the value of @v to @i. 32 * Atomically sets the value of @v to @i.
30 */ 33 */
31#define atomic_set(v, i) (((v)->counter) = (i)) 34static inline void atomic_set(atomic_t *v, int i)
35{
36 v->counter = i;
37}
32 38
33/** 39/**
34 * atomic_add - add integer to atomic variable 40 * atomic_add - add integer to atomic variable
@@ -200,8 +206,15 @@ static inline int atomic_sub_return(int i, atomic_t *v)
200 return atomic_add_return(-i, v); 206 return atomic_add_return(-i, v);
201} 207}
202 208
203#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 209static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
204#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) 210{
211 return cmpxchg(&v->counter, old, new);
212}
213
214static inline int atomic_xchg(atomic_t *v, int new)
215{
216 return xchg(&v->counter, new);
217}
205 218
206/** 219/**
207 * atomic_add_unless - add unless the number is already a given value 220 * atomic_add_unless - add unless the number is already a given value
@@ -250,45 +263,12 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
250/* An 64bit atomic type */ 263/* An 64bit atomic type */
251 264
252typedef struct { 265typedef struct {
253 unsigned long long counter; 266 u64 __aligned(8) counter;
254} atomic64_t; 267} atomic64_t;
255 268
256#define ATOMIC64_INIT(val) { (val) } 269#define ATOMIC64_INIT(val) { (val) }
257 270
258/** 271extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
259 * atomic64_read - read atomic64 variable
260 * @ptr: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292 272
293/** 273/**
294 * atomic64_xchg - xchg atomic64 variable 274 * atomic64_xchg - xchg atomic64 variable
@@ -298,18 +278,7 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
298 * Atomically xchgs the value of @ptr to @new_val and returns 278 * Atomically xchgs the value of @ptr to @new_val and returns
299 * the old value. 279 * the old value.
300 */ 280 */
301 281extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
302static inline unsigned long long
303atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
304{
305 unsigned long long old_val;
306
307 do {
308 old_val = atomic_read(ptr);
309 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
310
311 return old_val;
312}
313 282
314/** 283/**
315 * atomic64_set - set atomic64 variable 284 * atomic64_set - set atomic64 variable
@@ -318,10 +287,7 @@ atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
318 * 287 *
319 * Atomically sets the value of @ptr to @new_val. 288 * Atomically sets the value of @ptr to @new_val.
320 */ 289 */
321static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) 290extern void atomic64_set(atomic64_t *ptr, u64 new_val);
322{
323 atomic64_xchg(ptr, new_val);
324}
325 291
326/** 292/**
327 * atomic64_read - read atomic64 variable 293 * atomic64_read - read atomic64 variable
@@ -329,17 +295,30 @@ static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
329 * 295 *
330 * Atomically reads the value of @ptr and returns it. 296 * Atomically reads the value of @ptr and returns it.
331 */ 297 */
332static inline unsigned long long atomic64_read(atomic64_t *ptr) 298static inline u64 atomic64_read(atomic64_t *ptr)
333{ 299{
334 unsigned long long curr_val; 300 u64 res;
335 301
336 do { 302 /*
337 curr_val = __atomic64_read(ptr); 303 * Note, we inline this atomic64_t primitive because
338 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); 304 * it only clobbers EAX/EDX and leaves the others
339 305 * untouched. We also (somewhat subtly) rely on the
340 return curr_val; 306 * fact that cmpxchg8b returns the current 64-bit value
307 * of the memory location we are touching:
308 */
309 asm volatile(
310 "mov %%ebx, %%eax\n\t"
311 "mov %%ecx, %%edx\n\t"
312 LOCK_PREFIX "cmpxchg8b %1\n"
313 : "=&A" (res)
314 : "m" (*ptr)
315 );
316
317 return res;
341} 318}
342 319
320extern u64 atomic64_read(atomic64_t *ptr);
321
343/** 322/**
344 * atomic64_add_return - add and return 323 * atomic64_add_return - add and return
345 * @delta: integer value to add 324 * @delta: integer value to add
@@ -347,34 +326,14 @@ static inline unsigned long long atomic64_read(atomic64_t *ptr)
347 * 326 *
348 * Atomically adds @delta to @ptr and returns @delta + *@ptr 327 * Atomically adds @delta to @ptr and returns @delta + *@ptr
349 */ 328 */
350static inline unsigned long long 329extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
351atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
352{
353 unsigned long long old_val, new_val;
354
355 do {
356 old_val = atomic_read(ptr);
357 new_val = old_val + delta;
358
359 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
360
361 return new_val;
362}
363
364static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
365{
366 return atomic64_add_return(-delta, ptr);
367}
368 330
369static inline long atomic64_inc_return(atomic64_t *ptr) 331/*
370{ 332 * Other variants with different arithmetic operators:
371 return atomic64_add_return(1, ptr); 333 */
372} 334extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
373 335extern u64 atomic64_inc_return(atomic64_t *ptr);
374static inline long atomic64_dec_return(atomic64_t *ptr) 336extern u64 atomic64_dec_return(atomic64_t *ptr);
375{
376 return atomic64_sub_return(1, ptr);
377}
378 337
379/** 338/**
380 * atomic64_add - add integer to atomic64 variable 339 * atomic64_add - add integer to atomic64 variable
@@ -383,10 +342,7 @@ static inline long atomic64_dec_return(atomic64_t *ptr)
383 * 342 *
384 * Atomically adds @delta to @ptr. 343 * Atomically adds @delta to @ptr.
385 */ 344 */
386static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) 345extern void atomic64_add(u64 delta, atomic64_t *ptr);
387{
388 atomic64_add_return(delta, ptr);
389}
390 346
391/** 347/**
392 * atomic64_sub - subtract the atomic64 variable 348 * atomic64_sub - subtract the atomic64 variable
@@ -395,10 +351,7 @@ static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
395 * 351 *
396 * Atomically subtracts @delta from @ptr. 352 * Atomically subtracts @delta from @ptr.
397 */ 353 */
398static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) 354extern void atomic64_sub(u64 delta, atomic64_t *ptr);
399{
400 atomic64_add(-delta, ptr);
401}
402 355
403/** 356/**
404 * atomic64_sub_and_test - subtract value from variable and test result 357 * atomic64_sub_and_test - subtract value from variable and test result
@@ -409,13 +362,7 @@ static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
409 * true if the result is zero, or false for all 362 * true if the result is zero, or false for all
410 * other cases. 363 * other cases.
411 */ 364 */
412static inline int 365extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
413atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
414{
415 unsigned long long old_val = atomic64_sub_return(delta, ptr);
416
417 return old_val == 0;
418}
419 366
420/** 367/**
421 * atomic64_inc - increment atomic64 variable 368 * atomic64_inc - increment atomic64 variable
@@ -423,10 +370,7 @@ atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
423 * 370 *
424 * Atomically increments @ptr by 1. 371 * Atomically increments @ptr by 1.
425 */ 372 */
426static inline void atomic64_inc(atomic64_t *ptr) 373extern void atomic64_inc(atomic64_t *ptr);
427{
428 atomic64_add(1, ptr);
429}
430 374
431/** 375/**
432 * atomic64_dec - decrement atomic64 variable 376 * atomic64_dec - decrement atomic64 variable
@@ -434,10 +378,7 @@ static inline void atomic64_inc(atomic64_t *ptr)
434 * 378 *
435 * Atomically decrements @ptr by 1. 379 * Atomically decrements @ptr by 1.
436 */ 380 */
437static inline void atomic64_dec(atomic64_t *ptr) 381extern void atomic64_dec(atomic64_t *ptr);
438{
439 atomic64_sub(1, ptr);
440}
441 382
442/** 383/**
443 * atomic64_dec_and_test - decrement and test 384 * atomic64_dec_and_test - decrement and test
@@ -447,10 +388,7 @@ static inline void atomic64_dec(atomic64_t *ptr)
447 * returns true if the result is 0, or false for all other 388 * returns true if the result is 0, or false for all other
448 * cases. 389 * cases.
449 */ 390 */
450static inline int atomic64_dec_and_test(atomic64_t *ptr) 391extern int atomic64_dec_and_test(atomic64_t *ptr);
451{
452 return atomic64_sub_and_test(1, ptr);
453}
454 392
455/** 393/**
456 * atomic64_inc_and_test - increment and test 394 * atomic64_inc_and_test - increment and test
@@ -460,10 +398,7 @@ static inline int atomic64_dec_and_test(atomic64_t *ptr)
460 * and returns true if the result is zero, or false for all 398 * and returns true if the result is zero, or false for all
461 * other cases. 399 * other cases.
462 */ 400 */
463static inline int atomic64_inc_and_test(atomic64_t *ptr) 401extern int atomic64_inc_and_test(atomic64_t *ptr);
464{
465 return atomic64_sub_and_test(-1, ptr);
466}
467 402
468/** 403/**
469 * atomic64_add_negative - add and test if negative 404 * atomic64_add_negative - add and test if negative
@@ -474,13 +409,7 @@ static inline int atomic64_inc_and_test(atomic64_t *ptr)
474 * if the result is negative, or false when 409 * if the result is negative, or false when
475 * result is greater than or equal to zero. 410 * result is greater than or equal to zero.
476 */ 411 */
477static inline int 412extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
478atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
479{
480 long long old_val = atomic64_add_return(delta, ptr);
481
482 return old_val < 0;
483}
484 413
485#include <asm-generic/atomic-long.h> 414#include <asm-generic/atomic-long.h>
486#endif /* _ASM_X86_ATOMIC_32_H */ 415#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
index 0d6360220007..d605dc268e79 100644
--- a/arch/x86/include/asm/atomic_64.h
+++ b/arch/x86/include/asm/atomic_64.h
@@ -18,7 +18,10 @@
18 * 18 *
19 * Atomically reads the value of @v. 19 * Atomically reads the value of @v.
20 */ 20 */
21#define atomic_read(v) ((v)->counter) 21static inline int atomic_read(const atomic_t *v)
22{
23 return v->counter;
24}
22 25
23/** 26/**
24 * atomic_set - set atomic variable 27 * atomic_set - set atomic variable
@@ -27,7 +30,10 @@
27 * 30 *
28 * Atomically sets the value of @v to @i. 31 * Atomically sets the value of @v to @i.
29 */ 32 */
30#define atomic_set(v, i) (((v)->counter) = (i)) 33static inline void atomic_set(atomic_t *v, int i)
34{
35 v->counter = i;
36}
31 37
32/** 38/**
33 * atomic_add - add integer to atomic variable 39 * atomic_add - add integer to atomic variable
@@ -192,7 +198,10 @@ static inline int atomic_sub_return(int i, atomic_t *v)
192 * Atomically reads the value of @v. 198 * Atomically reads the value of @v.
193 * Doesn't imply a read memory barrier. 199 * Doesn't imply a read memory barrier.
194 */ 200 */
195#define atomic64_read(v) ((v)->counter) 201static inline long atomic64_read(const atomic64_t *v)
202{
203 return v->counter;
204}
196 205
197/** 206/**
198 * atomic64_set - set atomic64 variable 207 * atomic64_set - set atomic64 variable
@@ -201,7 +210,10 @@ static inline int atomic_sub_return(int i, atomic_t *v)
201 * 210 *
202 * Atomically sets the value of @v to @i. 211 * Atomically sets the value of @v to @i.
203 */ 212 */
204#define atomic64_set(v, i) (((v)->counter) = (i)) 213static inline void atomic64_set(atomic64_t *v, long i)
214{
215 v->counter = i;
216}
205 217
206/** 218/**
207 * atomic64_add - add integer to atomic64 variable 219 * atomic64_add - add integer to atomic64 variable
@@ -355,11 +367,25 @@ static inline long atomic64_sub_return(long i, atomic64_t *v)
355#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) 367#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
356#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) 368#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
357 369
358#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 370static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
359#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) 371{
372 return cmpxchg(&v->counter, old, new);
373}
374
375static inline long atomic64_xchg(atomic64_t *v, long new)
376{
377 return xchg(&v->counter, new);
378}
360 379
361#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 380static inline long atomic_cmpxchg(atomic_t *v, int old, int new)
362#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) 381{
382 return cmpxchg(&v->counter, old, new);
383}
384
385static inline long atomic_xchg(atomic_t *v, int new)
386{
387 return xchg(&v->counter, new);
388}
363 389
364/** 390/**
365 * atomic_add_unless - add unless the number is a given value 391 * atomic_add_unless - add unless the number is a given value
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index edc90f23e708..8406ed7f9926 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
33#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ 33#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
34 efi_call_virt(f, a1, a2, a3, a4, a5, a6) 34 efi_call_virt(f, a1, a2, a3, a4, a5, a6)
35 35
36#define efi_ioremap(addr, size) ioremap_cache(addr, size) 36#define efi_ioremap(addr, size, type) ioremap_cache(addr, size)
37 37
38#else /* !CONFIG_X86_32 */ 38#else /* !CONFIG_X86_32 */
39 39
@@ -84,7 +84,8 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
84 efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ 84 efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
85 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) 85 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
86 86
87extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); 87extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
88 u32 type);
88 89
89#endif /* CONFIG_X86_32 */ 90#endif /* CONFIG_X86_32 */
90 91
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 2d81af3974a0..7b2d71df39a6 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -111,12 +111,9 @@ enum fixed_addresses {
111#ifdef CONFIG_PARAVIRT 111#ifdef CONFIG_PARAVIRT
112 FIX_PARAVIRT_BOOTMAP, 112 FIX_PARAVIRT_BOOTMAP,
113#endif 113#endif
114 FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ 114 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
115 FIX_TEXT_POKE1, 115 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
116 __end_of_permanent_fixed_addresses, 116 __end_of_permanent_fixed_addresses,
117#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
118 FIX_OHCI1394_BASE,
119#endif
120 /* 117 /*
121 * 256 temporary boot-time mappings, used by early_ioremap(), 118 * 256 temporary boot-time mappings, used by early_ioremap(),
122 * before ioremap() is functional. 119 * before ioremap() is functional.
@@ -129,6 +126,9 @@ enum fixed_addresses {
129 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - 126 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
130 (__end_of_permanent_fixed_addresses & 255), 127 (__end_of_permanent_fixed_addresses & 255),
131 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, 128 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
129#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
130 FIX_OHCI1394_BASE,
131#endif
132#ifdef CONFIG_X86_32 132#ifdef CONFIG_X86_32
133 FIX_WP_TEST, 133 FIX_WP_TEST,
134#endif 134#endif
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index daf866ed0612..330ee807f89e 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -161,6 +161,7 @@ extern int io_apic_set_pci_routing(struct device *dev, int irq,
161 struct io_apic_irq_attr *irq_attr); 161 struct io_apic_irq_attr *irq_attr);
162extern int (*ioapic_renumber_irq)(int ioapic, int irq); 162extern int (*ioapic_renumber_irq)(int ioapic, int irq);
163extern void ioapic_init_mappings(void); 163extern void ioapic_init_mappings(void);
164extern void ioapic_insert_resources(void);
164 165
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 166extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 167extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
@@ -180,6 +181,7 @@ extern void ioapic_write_entry(int apic, int pin,
180#define io_apic_assign_pci_irqs 0 181#define io_apic_assign_pci_irqs 0
181static const int timer_through_8259 = 0; 182static const int timer_through_8259 = 0;
182static inline void ioapic_init_mappings(void) { } 183static inline void ioapic_init_mappings(void) { }
184static inline void ioapic_insert_resources(void) { }
183 185
184static inline void probe_nr_irqs_gsi(void) { } 186static inline void probe_nr_irqs_gsi(void) { }
185#endif 187#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 2bdab21f0898..c6ccbe7e81ad 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -12,9 +12,15 @@ static inline unsigned long native_save_fl(void)
12{ 12{
13 unsigned long flags; 13 unsigned long flags;
14 14
15 /*
16 * Note: this needs to be "=r" not "=rm", because we have the
17 * stack offset from what gcc expects at the time the "pop" is
18 * executed, and so a memory reference with respect to the stack
19 * would end up using the wrong address.
20 */
15 asm volatile("# __raw_save_flags\n\t" 21 asm volatile("# __raw_save_flags\n\t"
16 "pushf ; pop %0" 22 "pushf ; pop %0"
17 : "=g" (flags) 23 : "=r" (flags)
18 : /* no input */ 24 : /* no input */
19 : "memory"); 25 : "memory");
20 26
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 313389cd50d2..5136dad57cbb 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,7 @@
17/* Pages for switcher itself, then two pages per cpu */ 17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
19 19
20/* We map at -4M (-2M when PAE is activated) for ease of mapping 20/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
21 * into the guest (one PTE page). */
22#ifdef CONFIG_X86_PAE 21#ifdef CONFIG_X86_PAE
23#define SWITCHER_ADDR 0xFFE00000 22#define SWITCHER_ADDR 0xFFE00000
24#else 23#else
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index d31c4a684078..ba0eed8aa1a6 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -30,27 +30,27 @@
30#include <asm/hw_irq.h> 30#include <asm/hw_irq.h>
31#include <asm/kvm_para.h> 31#include <asm/kvm_para.h>
32 32
33/*G:031 But first, how does our Guest contact the Host to ask for privileged 33/*G:030
34 * But first, how does our Guest contact the Host to ask for privileged
34 * operations? There are two ways: the direct way is to make a "hypercall", 35 * operations? There are two ways: the direct way is to make a "hypercall",
35 * to make requests of the Host Itself. 36 * to make requests of the Host Itself.
36 * 37 *
37 * We use the KVM hypercall mechanism. Seventeen hypercalls are 38 * We use the KVM hypercall mechanism, though completely different hypercall
38 * available: the hypercall number is put in the %eax register, and the 39 * numbers. Seventeen hypercalls are available: the hypercall number is put in
39 * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. 40 * the %eax register, and the arguments (when required) are placed in %ebx,
40 * If a return value makes sense, it's returned in %eax. 41 * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax.
41 * 42 *
42 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
43 * Host, rather than returning failure. This reflects Winston Churchill's 44 * Host, rather than returning failure. This reflects Winston Churchill's
44 * definition of a gentleman: "someone who is only rude intentionally". */ 45 * definition of a gentleman: "someone who is only rude intentionally".
45/*:*/ 46:*/
46 47
47/* Can't use our min() macro here: needs to be a constant */ 48/* Can't use our min() macro here: needs to be a constant */
48#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 49#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
49 50
50#define LHCALL_RING_SIZE 64 51#define LHCALL_RING_SIZE 64
51struct hcall_args { 52struct hcall_args {
52 /* These map directly onto eax, ebx, ecx, edx and esi 53 /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
53 * in struct lguest_regs */
54 unsigned long arg0, arg1, arg2, arg3, arg4; 54 unsigned long arg0, arg1, arg2, arg3, arg4;
55}; 55};
56 56
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1692fb5050e3..6be7fc254b59 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -246,10 +246,6 @@
246#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) 246#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38)
247#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) 247#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39)
248 248
249/* Intel Model 6 */
250#define MSR_P6_EVNTSEL0 0x00000186
251#define MSR_P6_EVNTSEL1 0x00000187
252
253/* P4/Xeon+ specific */ 249/* P4/Xeon+ specific */
254#define MSR_IA32_MCG_EAX 0x00000180 250#define MSR_IA32_MCG_EAX 0x00000180
255#define MSR_IA32_MCG_EBX 0x00000181 251#define MSR_IA32_MCG_EBX 0x00000181
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c97264409934..c86e5ed4af51 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -72,7 +72,6 @@ void lapic_watchdog_stop(void);
72int lapic_watchdog_init(unsigned nmi_hz); 72int lapic_watchdog_init(unsigned nmi_hz);
73int lapic_wd_event(unsigned nmi_hz); 73int lapic_wd_event(unsigned nmi_hz);
74unsigned lapic_adjust_nmi_hz(unsigned hz); 74unsigned lapic_adjust_nmi_hz(unsigned hz);
75int lapic_watchdog_ok(void);
76void disable_lapic_nmi_watchdog(void); 75void disable_lapic_nmi_watchdog(void);
77void enable_lapic_nmi_watchdog(void); 76void enable_lapic_nmi_watchdog(void);
78void stop_nmi(void); 77void stop_nmi(void);
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index dd14c54ac718..0e8c2a0fd922 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte)
46 __free_page(pte); 46 __free_page(pte);
47} 47}
48 48
49extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); 49extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
50
51static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
52 unsigned long address)
53{
54 ___pte_free_tlb(tlb, pte);
55}
50 56
51static inline void pmd_populate_kernel(struct mm_struct *mm, 57static inline void pmd_populate_kernel(struct mm_struct *mm,
52 pmd_t *pmd, pte_t *pte) 58 pmd_t *pmd, pte_t *pte)
@@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
78 free_page((unsigned long)pmd); 84 free_page((unsigned long)pmd);
79} 85}
80 86
81extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); 87extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
88
89static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
90 unsigned long adddress)
91{
92 ___pmd_free_tlb(tlb, pmd);
93}
82 94
83#ifdef CONFIG_X86_PAE 95#ifdef CONFIG_X86_PAE
84extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); 96extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
@@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
108 free_page((unsigned long)pud); 120 free_page((unsigned long)pud);
109} 121}
110 122
111extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); 123extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
124
125static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
126 unsigned long address)
127{
128 ___pud_free_tlb(tlb, pud);
129}
130
112#endif /* PAGETABLE_LEVELS > 3 */ 131#endif /* PAGETABLE_LEVELS > 3 */
113#endif /* PAGETABLE_LEVELS > 2 */ 132#endif /* PAGETABLE_LEVELS > 2 */
114 133
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index b7e5db876399..4e77853321db 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -302,4 +302,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
302#define _raw_read_relax(lock) cpu_relax() 302#define _raw_read_relax(lock) cpu_relax()
303#define _raw_write_relax(lock) cpu_relax() 303#define _raw_write_relax(lock) cpu_relax()
304 304
305/* The {read|write|spin}_lock() on x86 are full memory barriers. */
306static inline void smp_mb__after_lock(void) { }
307#define ARCH_HAS_SMP_MB_AFTER_LOCK
308
305#endif /* _ASM_X86_SPINLOCK_H */ 309#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f517944b2b17..cf86a5e73815 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,6 +3,8 @@
3 3
4extern int kstack_depth_to_print; 4extern int kstack_depth_to_print;
5 5
6int x86_is_stack_id(int id, char *name);
7
6/* Generic stack tracer with callbacks */ 8/* Generic stack tracer with callbacks */
7 9
8struct stacktrace_ops { 10struct stacktrace_ops {
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index b0783520988b..fad7d40b75f8 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -49,7 +49,7 @@ struct thread_info {
49 .exec_domain = &default_exec_domain, \ 49 .exec_domain = &default_exec_domain, \
50 .flags = 0, \ 50 .flags = 0, \
51 .cpu = 0, \ 51 .cpu = 0, \
52 .preempt_count = 1, \ 52 .preempt_count = INIT_PREEMPT_COUNT, \
53 .addr_limit = KERNEL_DS, \ 53 .addr_limit = KERNEL_DS, \
54 .restart_block = { \ 54 .restart_block = { \
55 .fn = do_no_restart_syscall, \ 55 .fn = do_no_restart_syscall, \
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 20e6a795e160..d2c6c930b491 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -212,9 +212,9 @@ extern int __get_user_bad(void);
212 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") 212 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
213#else 213#else
214#define __put_user_asm_u64(x, ptr, retval, errret) \ 214#define __put_user_asm_u64(x, ptr, retval, errret) \
215 __put_user_asm(x, ptr, retval, "q", "", "Zr", errret) 215 __put_user_asm(x, ptr, retval, "q", "", "er", errret)
216#define __put_user_asm_ex_u64(x, addr) \ 216#define __put_user_asm_ex_u64(x, addr) \
217 __put_user_asm_ex(x, addr, "q", "", "Zr") 217 __put_user_asm_ex(x, addr, "q", "", "er")
218#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) 218#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
219#endif 219#endif
220 220
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 8cc687326eb8..db24b215fc50 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -88,11 +88,11 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
88 ret, "l", "k", "ir", 4); 88 ret, "l", "k", "ir", 4);
89 return ret; 89 return ret;
90 case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, 90 case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
91 ret, "q", "", "ir", 8); 91 ret, "q", "", "er", 8);
92 return ret; 92 return ret;
93 case 10: 93 case 10:
94 __put_user_asm(*(u64 *)src, (u64 __user *)dst, 94 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
95 ret, "q", "", "ir", 10); 95 ret, "q", "", "er", 10);
96 if (unlikely(ret)) 96 if (unlikely(ret))
97 return ret; 97 return ret;
98 asm("":::"memory"); 98 asm("":::"memory");
@@ -101,12 +101,12 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
101 return ret; 101 return ret;
102 case 16: 102 case 16:
103 __put_user_asm(*(u64 *)src, (u64 __user *)dst, 103 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
104 ret, "q", "", "ir", 16); 104 ret, "q", "", "er", 16);
105 if (unlikely(ret)) 105 if (unlikely(ret))
106 return ret; 106 return ret;
107 asm("":::"memory"); 107 asm("":::"memory");
108 __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, 108 __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
109 ret, "q", "", "ir", 8); 109 ret, "q", "", "er", 8);
110 return ret; 110 return ret;
111 default: 111 default:
112 return copy_user_generic((__force void *)dst, src, size); 112 return copy_user_generic((__force void *)dst, src, size);
@@ -157,7 +157,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
157 ret, "q", "", "=r", 8); 157 ret, "q", "", "=r", 8);
158 if (likely(!ret)) 158 if (likely(!ret))
159 __put_user_asm(tmp, (u64 __user *)dst, 159 __put_user_asm(tmp, (u64 __user *)dst,
160 ret, "q", "", "ir", 8); 160 ret, "q", "", "er", 8);
161 return ret; 161 return ret;
162 } 162 }
163 default: 163 default:
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 341070f7ad5c..77a68505419a 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -175,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) 175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
176 176
177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ 177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
178 ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) 178 (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
179 179
180#define UV_APIC_PNODE_SHIFT 6 180#define UV_APIC_PNODE_SHIFT 6
181 181
@@ -327,6 +327,7 @@ struct uv_blade_info {
327 unsigned short nr_possible_cpus; 327 unsigned short nr_possible_cpus;
328 unsigned short nr_online_cpus; 328 unsigned short nr_online_cpus;
329 unsigned short pnode; 329 unsigned short pnode;
330 short memory_nid;
330}; 331};
331extern struct uv_blade_info *uv_blade_info; 332extern struct uv_blade_info *uv_blade_info;
332extern short *uv_node_to_blade; 333extern short *uv_node_to_blade;
@@ -363,6 +364,12 @@ static inline int uv_blade_to_pnode(int bid)
363 return uv_blade_info[bid].pnode; 364 return uv_blade_info[bid].pnode;
364} 365}
365 366
367/* Nid of memory node on blade. -1 if no blade-local memory */
368static inline int uv_blade_to_memory_nid(int bid)
369{
370 return uv_blade_info[bid].memory_nid;
371}
372
366/* Determine the number of possible cpus on a blade */ 373/* Determine the number of possible cpus on a blade */
367static inline int uv_blade_nr_possible_cpus(int bid) 374static inline int uv_blade_nr_possible_cpus(int bid)
368{ 375{
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 6c327b852e23..430d5b24af7b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,6 +26,8 @@ CFLAGS_tsc.o := $(nostackp)
26CFLAGS_paravirt.o := $(nostackp) 26CFLAGS_paravirt.o := $(nostackp)
27GCOV_PROFILE_vsyscall_64.o := n 27GCOV_PROFILE_vsyscall_64.o := n
28GCOV_PROFILE_hpet.o := n 28GCOV_PROFILE_hpet.o := n
29GCOV_PROFILE_tsc.o := n
30GCOV_PROFILE_paravirt.o := n
29 31
30obj-y := process_$(BITS).o signal.o entry_$(BITS).o 32obj-y := process_$(BITS).o signal.o entry_$(BITS).o
31obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 33obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 9372f0406ad4..6c99f5037801 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1192,7 +1192,7 @@ out:
1192 return 0; 1192 return 0;
1193} 1193}
1194 1194
1195struct notifier_block device_nb = { 1195static struct notifier_block device_nb = {
1196 .notifier_call = device_change_notifier, 1196 .notifier_call = device_change_notifier,
1197}; 1197};
1198 1198
@@ -1763,7 +1763,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
1763 flag |= __GFP_ZERO; 1763 flag |= __GFP_ZERO;
1764 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 1764 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1765 if (!virt_addr) 1765 if (!virt_addr)
1766 return 0; 1766 return NULL;
1767 1767
1768 paddr = virt_to_phys(virt_addr); 1768 paddr = virt_to_phys(virt_addr);
1769 1769
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 10b2accd12ea..c1b17e97252e 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -472,6 +472,8 @@ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
472 if (iommu->evt_buf == NULL) 472 if (iommu->evt_buf == NULL)
473 return NULL; 473 return NULL;
474 474
475 iommu->evt_buf_size = EVT_BUFFER_SIZE;
476
475 return iommu->evt_buf; 477 return iommu->evt_buf;
476} 478}
477 479
@@ -691,6 +693,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
691 693
692 devid = e->devid; 694 devid = e->devid;
693 devid_to = e->ext >> 8; 695 devid_to = e->ext >> 8;
696 set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
694 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); 697 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
695 amd_iommu_alias_table[devid] = devid_to; 698 amd_iommu_alias_table[devid] = devid_to;
696 break; 699 break;
@@ -749,11 +752,13 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
749 752
750 devid = e->devid; 753 devid = e->devid;
751 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 754 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
752 if (alias) 755 if (alias) {
753 amd_iommu_alias_table[dev_i] = devid_to; 756 amd_iommu_alias_table[dev_i] = devid_to;
754 set_dev_entry_from_acpi(iommu, 757 set_dev_entry_from_acpi(iommu,
755 amd_iommu_alias_table[dev_i], 758 devid_to, flags, ext_flags);
756 flags, ext_flags); 759 }
760 set_dev_entry_from_acpi(iommu, dev_i,
761 flags, ext_flags);
757 } 762 }
758 break; 763 break;
759 default: 764 default:
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8c7c042ecad1..0a1c2830ec66 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -140,7 +140,6 @@ int x2apic_mode;
140#ifdef CONFIG_X86_X2APIC 140#ifdef CONFIG_X86_X2APIC
141/* x2apic enabled before OS handover */ 141/* x2apic enabled before OS handover */
142static int x2apic_preenabled; 142static int x2apic_preenabled;
143static int disable_x2apic;
144static __init int setup_nox2apic(char *str) 143static __init int setup_nox2apic(char *str)
145{ 144{
146 if (x2apic_enabled()) { 145 if (x2apic_enabled()) {
@@ -149,7 +148,6 @@ static __init int setup_nox2apic(char *str)
149 return 0; 148 return 0;
150 } 149 }
151 150
152 disable_x2apic = 1;
153 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 151 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
154 return 0; 152 return 0;
155} 153}
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 69328ac8de9c..8952a5890281 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
652 return ret && es7000_apic_is_cluster(); 652 return ret && es7000_apic_is_cluster();
653} 653}
654 654
655struct apic apic_es7000_cluster = { 655/* We've been warned by a false positive warning.Use __refdata to keep calm. */
656struct apic __refdata apic_es7000_cluster = {
656 657
657 .name = "es7000", 658 .name = "es7000",
658 .probe = probe_es7000, 659 .probe = probe_es7000,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4d0216fcb36c..d2ed6c5ddc80 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1716,25 +1716,19 @@ __apicdebuginit(void) print_IO_APIC(void)
1716 return; 1716 return;
1717} 1717}
1718 1718
1719__apicdebuginit(void) print_APIC_bitfield(int base) 1719__apicdebuginit(void) print_APIC_field(int base)
1720{ 1720{
1721 unsigned int v; 1721 int i;
1722 int i, j;
1723 1722
1724 if (apic_verbosity == APIC_QUIET) 1723 if (apic_verbosity == APIC_QUIET)
1725 return; 1724 return;
1726 1725
1727 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); 1726 printk(KERN_DEBUG);
1728 for (i = 0; i < 8; i++) { 1727
1729 v = apic_read(base + i*0x10); 1728 for (i = 0; i < 8; i++)
1730 for (j = 0; j < 32; j++) { 1729 printk(KERN_CONT "%08x", apic_read(base + i*0x10));
1731 if (v & (1<<j)) 1730
1732 printk("1"); 1731 printk(KERN_CONT "\n");
1733 else
1734 printk("0");
1735 }
1736 printk("\n");
1737 }
1738} 1732}
1739 1733
1740__apicdebuginit(void) print_local_APIC(void *dummy) 1734__apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1745,7 +1739,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1745 if (apic_verbosity == APIC_QUIET) 1739 if (apic_verbosity == APIC_QUIET)
1746 return; 1740 return;
1747 1741
1748 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1742 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1749 smp_processor_id(), hard_smp_processor_id()); 1743 smp_processor_id(), hard_smp_processor_id());
1750 v = apic_read(APIC_ID); 1744 v = apic_read(APIC_ID);
1751 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); 1745 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
@@ -1786,11 +1780,11 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1786 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); 1780 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1787 1781
1788 printk(KERN_DEBUG "... APIC ISR field:\n"); 1782 printk(KERN_DEBUG "... APIC ISR field:\n");
1789 print_APIC_bitfield(APIC_ISR); 1783 print_APIC_field(APIC_ISR);
1790 printk(KERN_DEBUG "... APIC TMR field:\n"); 1784 printk(KERN_DEBUG "... APIC TMR field:\n");
1791 print_APIC_bitfield(APIC_TMR); 1785 print_APIC_field(APIC_TMR);
1792 printk(KERN_DEBUG "... APIC IRR field:\n"); 1786 printk(KERN_DEBUG "... APIC IRR field:\n");
1793 print_APIC_bitfield(APIC_IRR); 1787 print_APIC_field(APIC_IRR);
1794 1788
1795 if (APIC_INTEGRATED(ver)) { /* !82489DX */ 1789 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1796 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1790 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -3799,6 +3793,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3799 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3793 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3800 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3794 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3801 3795
3796 if (cfg->move_in_progress)
3797 send_cleanup_vector(cfg);
3798
3802 return irq; 3799 return irq;
3803} 3800}
3804 3801
@@ -4187,28 +4184,20 @@ fake_ioapic_page:
4187 } 4184 }
4188} 4185}
4189 4186
4190static int __init ioapic_insert_resources(void) 4187void __init ioapic_insert_resources(void)
4191{ 4188{
4192 int i; 4189 int i;
4193 struct resource *r = ioapic_resources; 4190 struct resource *r = ioapic_resources;
4194 4191
4195 if (!r) { 4192 if (!r) {
4196 if (nr_ioapics > 0) { 4193 if (nr_ioapics > 0)
4197 printk(KERN_ERR 4194 printk(KERN_ERR
4198 "IO APIC resources couldn't be allocated.\n"); 4195 "IO APIC resources couldn't be allocated.\n");
4199 return -1; 4196 return;
4200 }
4201 return 0;
4202 } 4197 }
4203 4198
4204 for (i = 0; i < nr_ioapics; i++) { 4199 for (i = 0; i < nr_ioapics; i++) {
4205 insert_resource(&iomem_resource, r); 4200 insert_resource(&iomem_resource, r);
4206 r++; 4201 r++;
4207 } 4202 }
4208
4209 return 0;
4210} 4203}
4211
4212/* Insert the IO APIC resources after PCI initialization has occured to handle
4213 * IO APICS that are mapped in on a BAR in PCI space. */
4214late_initcall(ioapic_insert_resources);
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 533e59c6fc82..ca96e68f0d23 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void)
493 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); 493 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
494} 494}
495 495
496struct apic apic_numaq = { 496/* Use __refdata to keep false positive warning calm. */
497struct apic __refdata apic_numaq = {
497 498
498 .name = "NUMAQ", 499 .name = "NUMAQ",
499 .probe = probe_numaq, 500 .probe = probe_numaq,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8e4cbb255c38..a5371ec36776 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
17 return x2apic_enabled(); 17 return x2apic_enabled();
18} 18}
19 19
20/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 20/*
21 21 * need to use more than cpu 0, because we need more vectors when
22 * MSI-X are used.
23 */
22static const struct cpumask *x2apic_target_cpus(void) 24static const struct cpumask *x2apic_target_cpus(void)
23{ 25{
24 return cpumask_of(0); 26 return cpu_online_mask;
25} 27}
26 28
27/* 29/*
@@ -170,7 +172,7 @@ static unsigned long set_apic_id(unsigned int id)
170 172
171static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) 173static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
172{ 174{
173 return current_cpu_data.initial_apicid >> index_msb; 175 return initial_apicid >> index_msb;
174} 176}
175 177
176static void x2apic_send_IPI_self(int vector) 178static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a284359627e7..a8989aadc99a 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
27 return 0; 27 return 0;
28} 28}
29 29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 30/*
31 31 * need to use more than cpu 0, because we need more vectors when
32 * MSI-X are used.
33 */
32static const struct cpumask *x2apic_target_cpus(void) 34static const struct cpumask *x2apic_target_cpus(void)
33{ 35{
34 return cpumask_of(0); 36 return cpu_online_mask;
35} 37}
36 38
37static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) 39static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
@@ -162,7 +164,7 @@ static unsigned long set_apic_id(unsigned int id)
162 164
163static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) 165static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
164{ 166{
165 return current_cpu_data.initial_apicid >> index_msb; 167 return initial_apicid >> index_msb;
166} 168}
167 169
168static void x2apic_send_IPI_self(int vector) 170static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 096d19aea2f7..832e908adcb5 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = {
261 .apic_id_registered = uv_apic_id_registered, 261 .apic_id_registered = uv_apic_id_registered,
262 262
263 .irq_delivery_mode = dest_Fixed, 263 .irq_delivery_mode = dest_Fixed,
264 .irq_dest_mode = 1, /* logical */ 264 .irq_dest_mode = 0, /* physical */
265 265
266 .target_cpus = uv_target_cpus, 266 .target_cpus = uv_target_cpus,
267 .disable_esr = 0, 267 .disable_esr = 0,
@@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
362 BUG(); 362 BUG();
363} 363}
364 364
365static __init void map_low_mmrs(void)
366{
367 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
368 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
369}
370
371enum map_type {map_wb, map_uc}; 365enum map_type {map_wb, map_uc};
372 366
373static __init void map_high(char *id, unsigned long base, int shift, 367static __init void map_high(char *id, unsigned long base, int shift,
@@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode)
395 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
396} 390}
397 391
398static __init void map_config_high(int max_pnode)
399{
400 union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
401 int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
402
403 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
404 if (cfg.s.enable)
405 map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
406}
407
408static __init void map_mmr_high(int max_pnode)
409{
410 union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
411 int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
412
413 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
414 if (mmr.s.enable)
415 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
416}
417
418static __init void map_mmioh_high(int max_pnode) 392static __init void map_mmioh_high(int max_pnode)
419{ 393{
420 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 394 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -566,8 +540,6 @@ void __init uv_system_init(void)
566 unsigned long mmr_base, present, paddr; 540 unsigned long mmr_base, present, paddr;
567 unsigned short pnode_mask; 541 unsigned short pnode_mask;
568 542
569 map_low_mmrs();
570
571 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 543 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
572 m_val = m_n_config.s.m_skt; 544 m_val = m_n_config.s.m_skt;
573 n_val = m_n_config.s.n_skt; 545 n_val = m_n_config.s.n_skt;
@@ -591,6 +563,8 @@ void __init uv_system_init(void)
591 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 563 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
592 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 564 uv_blade_info = kmalloc(bytes, GFP_KERNEL);
593 BUG_ON(!uv_blade_info); 565 BUG_ON(!uv_blade_info);
566 for (blade = 0; blade < uv_num_possible_blades(); blade++)
567 uv_blade_info[blade].memory_nid = -1;
594 568
595 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); 569 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
596 570
@@ -629,6 +603,9 @@ void __init uv_system_init(void)
629 lcpu = uv_blade_info[blade].nr_possible_cpus; 603 lcpu = uv_blade_info[blade].nr_possible_cpus;
630 uv_blade_info[blade].nr_possible_cpus++; 604 uv_blade_info[blade].nr_possible_cpus++;
631 605
606 /* Any node on the blade, else will contain -1. */
607 uv_blade_info[blade].memory_nid = nid;
608
632 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 609 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
633 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; 610 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
634 uv_cpu_hub_info(cpu)->m_val = m_val; 611 uv_cpu_hub_info(cpu)->m_val = m_val;
@@ -662,11 +639,10 @@ void __init uv_system_init(void)
662 pnode = (paddr >> m_val) & pnode_mask; 639 pnode = (paddr >> m_val) & pnode_mask;
663 blade = boot_pnode_to_blade(pnode); 640 blade = boot_pnode_to_blade(pnode);
664 uv_node_to_blade[nid] = blade; 641 uv_node_to_blade[nid] = blade;
642 max_pnode = max(pnode, max_pnode);
665 } 643 }
666 644
667 map_gru_high(max_pnode); 645 map_gru_high(max_pnode);
668 map_mmr_high(max_pnode);
669 map_config_high(max_pnode);
670 map_mmioh_high(max_pnode); 646 map_mmioh_high(max_pnode);
671 647
672 uv_cpu_init(); 648 uv_cpu_init();
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 79302e9a33a4..442b5508893f 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -811,7 +811,7 @@ static int apm_do_idle(void)
811 u8 ret = 0; 811 u8 ret = 0;
812 int idled = 0; 812 int idled = 0;
813 int polling; 813 int polling;
814 int err; 814 int err = 0;
815 815
816 polling = !!(current_thread_info()->status & TS_POLLING); 816 polling = !!(current_thread_info()->status & TS_POLLING);
817 if (polling) { 817 if (polling) {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 28e5f5956042..63fddcd082cd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -356,7 +356,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
356#endif 356#endif
357#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) 357#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
358 /* check CPU config space for extended APIC ID */ 358 /* check CPU config space for extended APIC ID */
359 if (c->x86 >= 0xf) { 359 if (cpu_has_apic && c->x86 >= 0xf) {
360 unsigned int val; 360 unsigned int val;
361 val = read_pci_config(0, 24, 0, 0x68); 361 val = read_pci_config(0, 24, 0, 0x68);
362 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) 362 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
@@ -400,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
400 level = cpuid_eax(1); 400 level = cpuid_eax(1);
401 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) 401 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
402 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 402 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
403
404 /*
405 * Some BIOSes incorrectly force this feature, but only K8
406 * revision D (model = 0x14) and later actually support it.
407 */
408 if (c->x86_model < 0x14)
409 clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
403 } 410 }
404 if (c->x86 == 0x10 || c->x86 == 0x11) 411 if (c->x86 == 0x10 || c->x86 == 0x11)
405 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 412 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f1961c07af9a..5ce60a88027b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void)
59 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); 59 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
60} 60}
61 61
62static const struct cpu_dev *this_cpu __cpuinitdata; 62static void __cpuinit default_init(struct cpuinfo_x86 *c)
63{
64#ifdef CONFIG_X86_64
65 display_cacheinfo(c);
66#else
67 /* Not much we can do here... */
68 /* Check if at least it has cpuid */
69 if (c->cpuid_level == -1) {
70 /* No cpuid. It must be an ancient CPU */
71 if (c->x86 == 4)
72 strcpy(c->x86_model_id, "486");
73 else if (c->x86 == 3)
74 strcpy(c->x86_model_id, "386");
75 }
76#endif
77}
78
79static const struct cpu_dev __cpuinitconst default_cpu = {
80 .c_init = default_init,
81 .c_vendor = "Unknown",
82 .c_x86_vendor = X86_VENDOR_UNKNOWN,
83};
84
85static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
63 86
64DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { 87DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
65#ifdef CONFIG_X86_64 88#ifdef CONFIG_X86_64
@@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu)
332 355
333static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; 356static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
334 357
335static void __cpuinit default_init(struct cpuinfo_x86 *c)
336{
337#ifdef CONFIG_X86_64
338 display_cacheinfo(c);
339#else
340 /* Not much we can do here... */
341 /* Check if at least it has cpuid */
342 if (c->cpuid_level == -1) {
343 /* No cpuid. It must be an ancient CPU */
344 if (c->x86 == 4)
345 strcpy(c->x86_model_id, "486");
346 else if (c->x86 == 3)
347 strcpy(c->x86_model_id, "386");
348 }
349#endif
350}
351
352static const struct cpu_dev __cpuinitconst default_cpu = {
353 .c_init = default_init,
354 .c_vendor = "Unknown",
355 .c_x86_vendor = X86_VENDOR_UNKNOWN,
356};
357
358static void __cpuinit get_model_name(struct cpuinfo_x86 *c) 358static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
359{ 359{
360 unsigned int *v; 360 unsigned int *v;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 81cbe64ed6b4..2a50ef891000 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -299,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
299static int transition_fid_vid(struct powernow_k8_data *data, 299static int transition_fid_vid(struct powernow_k8_data *data,
300 u32 reqfid, u32 reqvid) 300 u32 reqfid, u32 reqvid)
301{ 301{
302 if (core_voltage_pre_transition(data, reqvid)) 302 if (core_voltage_pre_transition(data, reqvid, reqfid))
303 return 1; 303 return 1;
304 304
305 if (core_frequency_transition(data, reqfid)) 305 if (core_frequency_transition(data, reqfid))
@@ -327,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data,
327 327
328/* Phase 1 - core voltage transition ... setup voltage */ 328/* Phase 1 - core voltage transition ... setup voltage */
329static int core_voltage_pre_transition(struct powernow_k8_data *data, 329static int core_voltage_pre_transition(struct powernow_k8_data *data,
330 u32 reqvid) 330 u32 reqvid, u32 reqfid)
331{ 331{
332 u32 rvosteps = data->rvo; 332 u32 rvosteps = data->rvo;
333 u32 savefid = data->currfid; 333 u32 savefid = data->currfid;
334 u32 maxvid, lo; 334 u32 maxvid, lo, rvomult = 1;
335 335
336 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " 336 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
337 "reqvid 0x%x, rvo 0x%x\n", 337 "reqvid 0x%x, rvo 0x%x\n",
338 smp_processor_id(), 338 smp_processor_id(),
339 data->currfid, data->currvid, reqvid, data->rvo); 339 data->currfid, data->currvid, reqvid, data->rvo);
340 340
341 if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
342 rvomult = 2;
343 rvosteps *= rvomult;
341 rdmsr(MSR_FIDVID_STATUS, lo, maxvid); 344 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
342 maxvid = 0x1f & (maxvid >> 16); 345 maxvid = 0x1f & (maxvid >> 16);
343 dprintk("ph1 maxvid=0x%x\n", maxvid); 346 dprintk("ph1 maxvid=0x%x\n", maxvid);
@@ -351,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data,
351 return 1; 354 return 1;
352 } 355 }
353 356
354 while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { 357 while ((rvosteps > 0) &&
358 ((rvomult * data->rvo + data->currvid) > reqvid)) {
355 if (data->currvid == maxvid) { 359 if (data->currvid == maxvid) {
356 rvosteps = 0; 360 rvosteps = 0;
357 } else { 361 } else {
@@ -384,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
384 u32 vcoreqfid, vcocurrfid, vcofiddiff; 388 u32 vcoreqfid, vcocurrfid, vcofiddiff;
385 u32 fid_interval, savevid = data->currvid; 389 u32 fid_interval, savevid = data->currvid;
386 390
387 if ((reqfid < HI_FID_TABLE_BOTTOM) &&
388 (data->currfid < HI_FID_TABLE_BOTTOM)) {
389 printk(KERN_ERR PFX "ph2: illegal lo-lo transition "
390 "0x%x 0x%x\n", reqfid, data->currfid);
391 return 1;
392 }
393
394 if (data->currfid == reqfid) { 391 if (data->currfid == reqfid) {
395 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", 392 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
396 data->currfid); 393 data->currfid);
@@ -407,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
407 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid 404 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
408 : vcoreqfid - vcocurrfid; 405 : vcoreqfid - vcocurrfid;
409 406
407 if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
408 vcofiddiff = 0;
409
410 while (vcofiddiff > 2) { 410 while (vcofiddiff > 2) {
411 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); 411 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
412 412
@@ -1081,14 +1081,6 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
1081 return 0; 1081 return 0;
1082 } 1082 }
1083 1083
1084 if ((fid < HI_FID_TABLE_BOTTOM) &&
1085 (data->currfid < HI_FID_TABLE_BOTTOM)) {
1086 printk(KERN_ERR PFX
1087 "ignoring illegal change in lo freq table-%x to 0x%x\n",
1088 data->currfid, fid);
1089 return 1;
1090 }
1091
1092 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", 1084 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
1093 smp_processor_id(), fid, vid); 1085 smp_processor_id(), fid, vid);
1094 freqs.old = find_khz_freq_from_fid(data->currfid); 1086 freqs.old = find_khz_freq_from_fid(data->currfid);
@@ -1267,7 +1259,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1267{ 1259{
1268 static const char ACPI_PSS_BIOS_BUG_MSG[] = 1260 static const char ACPI_PSS_BIOS_BUG_MSG[] =
1269 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" 1261 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1270 KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; 1262 FW_BUG PFX "Try again with latest BIOS.\n";
1271 struct powernow_k8_data *data; 1263 struct powernow_k8_data *data;
1272 struct init_on_cpu init_on_cpu; 1264 struct init_on_cpu init_on_cpu;
1273 int rc; 1265 int rc;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index c9c1190b5e1f..02ce824073cb 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -215,7 +215,8 @@ struct pst_s {
215 215
216#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) 216#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
217 217
218static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); 218static int core_voltage_pre_transition(struct powernow_k8_data *data,
219 u32 reqvid, u32 regfid);
219static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); 220static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
220static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); 221static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
221 222
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 20d498351105..14ce5d49b2ad 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -194,14 +194,14 @@ static void print_mce(struct mce *m)
194 m->cs, m->ip); 194 m->cs, m->ip);
195 if (m->cs == __KERNEL_CS) 195 if (m->cs == __KERNEL_CS)
196 print_symbol("{%s}", m->ip); 196 print_symbol("{%s}", m->ip);
197 printk("\n"); 197 printk(KERN_CONT "\n");
198 } 198 }
199 printk(KERN_EMERG "TSC %llx ", m->tsc); 199 printk(KERN_EMERG "TSC %llx ", m->tsc);
200 if (m->addr) 200 if (m->addr)
201 printk("ADDR %llx ", m->addr); 201 printk(KERN_CONT "ADDR %llx ", m->addr);
202 if (m->misc) 202 if (m->misc)
203 printk("MISC %llx ", m->misc); 203 printk(KERN_CONT "MISC %llx ", m->misc);
204 printk("\n"); 204 printk(KERN_CONT "\n");
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid, 206 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid); 207 m->apicid);
@@ -209,13 +209,13 @@ static void print_mce(struct mce *m)
209 209
210static void print_mce_head(void) 210static void print_mce_head(void)
211{ 211{
212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); 212 printk(KERN_EMERG "\nHARDWARE ERROR\n");
213} 213}
214 214
215static void print_mce_tail(void) 215static void print_mce_tail(void)
216{ 216{
217 printk(KERN_EMERG "This is not a software problem!\n" 217 printk(KERN_EMERG "This is not a software problem!\n"
218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); 218 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
219} 219}
220 220
221#define PANIC_TIMEOUT 5 /* 5 seconds */ 221#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -1692,17 +1692,15 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1692 const char *buf, size_t siz) 1692 const char *buf, size_t siz)
1693{ 1693{
1694 char *p; 1694 char *p;
1695 int len;
1696 1695
1697 strncpy(mce_helper, buf, sizeof(mce_helper)); 1696 strncpy(mce_helper, buf, sizeof(mce_helper));
1698 mce_helper[sizeof(mce_helper)-1] = 0; 1697 mce_helper[sizeof(mce_helper)-1] = 0;
1699 len = strlen(mce_helper);
1700 p = strchr(mce_helper, '\n'); 1698 p = strchr(mce_helper, '\n');
1701 1699
1702 if (*p) 1700 if (p)
1703 *p = 0; 1701 *p = 0;
1704 1702
1705 return len; 1703 return strlen(mce_helper) + !!p;
1706} 1704}
1707 1705
1708static ssize_t set_ignore_ce(struct sys_device *s, 1706static ssize_t set_ignore_ce(struct sys_device *s,
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index bff8dd191dd5..8bc64cfbe936 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -36,6 +36,7 @@
36 36
37static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 37static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
38static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 38static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
39static DEFINE_PER_CPU(bool, thermal_throttle_active);
39 40
40static atomic_t therm_throt_en = ATOMIC_INIT(0); 41static atomic_t therm_throt_en = ATOMIC_INIT(0);
41 42
@@ -96,24 +97,27 @@ static int therm_throt_process(int curr)
96{ 97{
97 unsigned int cpu = smp_processor_id(); 98 unsigned int cpu = smp_processor_id();
98 __u64 tmp_jiffs = get_jiffies_64(); 99 __u64 tmp_jiffs = get_jiffies_64();
100 bool was_throttled = __get_cpu_var(thermal_throttle_active);
101 bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr;
99 102
100 if (curr) 103 if (is_throttled)
101 __get_cpu_var(thermal_throttle_count)++; 104 __get_cpu_var(thermal_throttle_count)++;
102 105
103 if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) 106 if (!(was_throttled ^ is_throttled) &&
107 time_before64(tmp_jiffs, __get_cpu_var(next_check)))
104 return 0; 108 return 0;
105 109
106 __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; 110 __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
107 111
108 /* if we just entered the thermal event */ 112 /* if we just entered the thermal event */
109 if (curr) { 113 if (is_throttled) {
110 printk(KERN_CRIT "CPU%d: Temperature above threshold, " 114 printk(KERN_CRIT "CPU%d: Temperature above threshold, "
111 "cpu clock throttled (total events = %lu)\n", cpu, 115 "cpu clock throttled (total events = %lu)\n",
112 __get_cpu_var(thermal_throttle_count)); 116 cpu, __get_cpu_var(thermal_throttle_count));
113 117
114 add_taint(TAINT_MACHINE_CHECK); 118 add_taint(TAINT_MACHINE_CHECK);
115 } else { 119 } else if (was_throttled) {
116 printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); 120 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
117 } 121 }
118 122
119 return 1; 123 return 1;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 13bd6d6cf0bd..3d4ebbd2e129 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -55,6 +55,7 @@ struct x86_pmu {
55 int num_counters_fixed; 55 int num_counters_fixed;
56 int counter_bits; 56 int counter_bits;
57 u64 counter_mask; 57 u64 counter_mask;
58 int apic;
58 u64 max_period; 59 u64 max_period;
59 u64 intel_ctrl; 60 u64 intel_ctrl;
60}; 61};
@@ -66,6 +67,52 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
66}; 67};
67 68
68/* 69/*
70 * Not sure about some of these
71 */
72static const u64 p6_perfmon_event_map[] =
73{
74 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
75 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
76 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
77 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
78 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
79 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
80 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
81};
82
83static u64 p6_pmu_event_map(int event)
84{
85 return p6_perfmon_event_map[event];
86}
87
88/*
89 * Counter setting that is specified not to count anything.
90 * We use this to effectively disable a counter.
91 *
92 * L2_RQSTS with 0 MESI unit mask.
93 */
94#define P6_NOP_COUNTER 0x0000002EULL
95
96static u64 p6_pmu_raw_event(u64 event)
97{
98#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
99#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
100#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
101#define P6_EVNTSEL_INV_MASK 0x00800000ULL
102#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL
103
104#define P6_EVNTSEL_MASK \
105 (P6_EVNTSEL_EVENT_MASK | \
106 P6_EVNTSEL_UNIT_MASK | \
107 P6_EVNTSEL_EDGE_MASK | \
108 P6_EVNTSEL_INV_MASK | \
109 P6_EVNTSEL_COUNTER_MASK)
110
111 return event & P6_EVNTSEL_MASK;
112}
113
114
115/*
69 * Intel PerfMon v3. Used on Core2 and later. 116 * Intel PerfMon v3. Used on Core2 and later.
70 */ 117 */
71static const u64 intel_perfmon_event_map[] = 118static const u64 intel_perfmon_event_map[] =
@@ -567,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
567 614
568static bool reserve_pmc_hardware(void) 615static bool reserve_pmc_hardware(void)
569{ 616{
617#ifdef CONFIG_X86_LOCAL_APIC
570 int i; 618 int i;
571 619
572 if (nmi_watchdog == NMI_LOCAL_APIC) 620 if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -581,9 +629,11 @@ static bool reserve_pmc_hardware(void)
581 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 629 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
582 goto eventsel_fail; 630 goto eventsel_fail;
583 } 631 }
632#endif
584 633
585 return true; 634 return true;
586 635
636#ifdef CONFIG_X86_LOCAL_APIC
587eventsel_fail: 637eventsel_fail:
588 for (i--; i >= 0; i--) 638 for (i--; i >= 0; i--)
589 release_evntsel_nmi(x86_pmu.eventsel + i); 639 release_evntsel_nmi(x86_pmu.eventsel + i);
@@ -598,10 +648,12 @@ perfctr_fail:
598 enable_lapic_nmi_watchdog(); 648 enable_lapic_nmi_watchdog();
599 649
600 return false; 650 return false;
651#endif
601} 652}
602 653
603static void release_pmc_hardware(void) 654static void release_pmc_hardware(void)
604{ 655{
656#ifdef CONFIG_X86_LOCAL_APIC
605 int i; 657 int i;
606 658
607 for (i = 0; i < x86_pmu.num_counters; i++) { 659 for (i = 0; i < x86_pmu.num_counters; i++) {
@@ -611,6 +663,7 @@ static void release_pmc_hardware(void)
611 663
612 if (nmi_watchdog == NMI_LOCAL_APIC) 664 if (nmi_watchdog == NMI_LOCAL_APIC)
613 enable_lapic_nmi_watchdog(); 665 enable_lapic_nmi_watchdog();
666#endif
614} 667}
615 668
616static void hw_perf_counter_destroy(struct perf_counter *counter) 669static void hw_perf_counter_destroy(struct perf_counter *counter)
@@ -666,6 +719,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
666{ 719{
667 struct perf_counter_attr *attr = &counter->attr; 720 struct perf_counter_attr *attr = &counter->attr;
668 struct hw_perf_counter *hwc = &counter->hw; 721 struct hw_perf_counter *hwc = &counter->hw;
722 u64 config;
669 int err; 723 int err;
670 724
671 if (!x86_pmu_initialized()) 725 if (!x86_pmu_initialized())
@@ -701,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
701 hwc->sample_period = x86_pmu.max_period; 755 hwc->sample_period = x86_pmu.max_period;
702 hwc->last_period = hwc->sample_period; 756 hwc->last_period = hwc->sample_period;
703 atomic64_set(&hwc->period_left, hwc->sample_period); 757 atomic64_set(&hwc->period_left, hwc->sample_period);
758 } else {
759 /*
760 * If we have a PMU initialized but no APIC
761 * interrupts, we cannot sample hardware
762 * counters (user-space has to fall back and
763 * sample via a hrtimer based software counter):
764 */
765 if (!x86_pmu.apic)
766 return -EOPNOTSUPP;
704 } 767 }
705 768
706 counter->destroy = hw_perf_counter_destroy; 769 counter->destroy = hw_perf_counter_destroy;
@@ -718,14 +781,40 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
718 781
719 if (attr->config >= x86_pmu.max_events) 782 if (attr->config >= x86_pmu.max_events)
720 return -EINVAL; 783 return -EINVAL;
784
721 /* 785 /*
722 * The generic map: 786 * The generic map:
723 */ 787 */
724 hwc->config |= x86_pmu.event_map(attr->config); 788 config = x86_pmu.event_map(attr->config);
789
790 if (config == 0)
791 return -ENOENT;
792
793 if (config == -1LL)
794 return -EINVAL;
795
796 hwc->config |= config;
725 797
726 return 0; 798 return 0;
727} 799}
728 800
801static void p6_pmu_disable_all(void)
802{
803 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
804 u64 val;
805
806 if (!cpuc->enabled)
807 return;
808
809 cpuc->enabled = 0;
810 barrier();
811
812 /* p6 only has one enable register */
813 rdmsrl(MSR_P6_EVNTSEL0, val);
814 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
815 wrmsrl(MSR_P6_EVNTSEL0, val);
816}
817
729static void intel_pmu_disable_all(void) 818static void intel_pmu_disable_all(void)
730{ 819{
731 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 820 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@ -767,6 +856,23 @@ void hw_perf_disable(void)
767 return x86_pmu.disable_all(); 856 return x86_pmu.disable_all();
768} 857}
769 858
859static void p6_pmu_enable_all(void)
860{
861 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
862 unsigned long val;
863
864 if (cpuc->enabled)
865 return;
866
867 cpuc->enabled = 1;
868 barrier();
869
870 /* p6 only has one enable register */
871 rdmsrl(MSR_P6_EVNTSEL0, val);
872 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
873 wrmsrl(MSR_P6_EVNTSEL0, val);
874}
875
770static void intel_pmu_enable_all(void) 876static void intel_pmu_enable_all(void)
771{ 877{
772 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 878 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
@@ -784,13 +890,13 @@ static void amd_pmu_enable_all(void)
784 barrier(); 890 barrier();
785 891
786 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 892 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
893 struct perf_counter *counter = cpuc->counters[idx];
787 u64 val; 894 u64 val;
788 895
789 if (!test_bit(idx, cpuc->active_mask)) 896 if (!test_bit(idx, cpuc->active_mask))
790 continue; 897 continue;
791 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 898
792 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) 899 val = counter->hw.config;
793 continue;
794 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 900 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
795 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 901 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
796 } 902 }
@@ -819,16 +925,13 @@ static inline void intel_pmu_ack_status(u64 ack)
819 925
820static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 926static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
821{ 927{
822 int err; 928 (void)checking_wrmsrl(hwc->config_base + idx,
823 err = checking_wrmsrl(hwc->config_base + idx,
824 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); 929 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
825} 930}
826 931
827static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 932static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
828{ 933{
829 int err; 934 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
830 err = checking_wrmsrl(hwc->config_base + idx,
831 hwc->config);
832} 935}
833 936
834static inline void 937static inline void
@@ -836,13 +939,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
836{ 939{
837 int idx = __idx - X86_PMC_IDX_FIXED; 940 int idx = __idx - X86_PMC_IDX_FIXED;
838 u64 ctrl_val, mask; 941 u64 ctrl_val, mask;
839 int err;
840 942
841 mask = 0xfULL << (idx * 4); 943 mask = 0xfULL << (idx * 4);
842 944
843 rdmsrl(hwc->config_base, ctrl_val); 945 rdmsrl(hwc->config_base, ctrl_val);
844 ctrl_val &= ~mask; 946 ctrl_val &= ~mask;
845 err = checking_wrmsrl(hwc->config_base, ctrl_val); 947 (void)checking_wrmsrl(hwc->config_base, ctrl_val);
948}
949
950static inline void
951p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
952{
953 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
954 u64 val = P6_NOP_COUNTER;
955
956 if (cpuc->enabled)
957 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
958
959 (void)checking_wrmsrl(hwc->config_base + idx, val);
846} 960}
847 961
848static inline void 962static inline void
@@ -943,6 +1057,19 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
943 err = checking_wrmsrl(hwc->config_base, ctrl_val); 1057 err = checking_wrmsrl(hwc->config_base, ctrl_val);
944} 1058}
945 1059
1060static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1061{
1062 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1063 u64 val;
1064
1065 val = hwc->config;
1066 if (cpuc->enabled)
1067 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1068
1069 (void)checking_wrmsrl(hwc->config_base + idx, val);
1070}
1071
1072
946static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1073static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
947{ 1074{
948 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1075 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -959,8 +1086,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
959 1086
960 if (cpuc->enabled) 1087 if (cpuc->enabled)
961 x86_pmu_enable_counter(hwc, idx); 1088 x86_pmu_enable_counter(hwc, idx);
962 else
963 x86_pmu_disable_counter(hwc, idx);
964} 1089}
965 1090
966static int 1091static int
@@ -1176,6 +1301,49 @@ static void intel_pmu_reset(void)
1176 local_irq_restore(flags); 1301 local_irq_restore(flags);
1177} 1302}
1178 1303
1304static int p6_pmu_handle_irq(struct pt_regs *regs)
1305{
1306 struct perf_sample_data data;
1307 struct cpu_hw_counters *cpuc;
1308 struct perf_counter *counter;
1309 struct hw_perf_counter *hwc;
1310 int idx, handled = 0;
1311 u64 val;
1312
1313 data.regs = regs;
1314 data.addr = 0;
1315
1316 cpuc = &__get_cpu_var(cpu_hw_counters);
1317
1318 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1319 if (!test_bit(idx, cpuc->active_mask))
1320 continue;
1321
1322 counter = cpuc->counters[idx];
1323 hwc = &counter->hw;
1324
1325 val = x86_perf_counter_update(counter, hwc, idx);
1326 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
1327 continue;
1328
1329 /*
1330 * counter overflow
1331 */
1332 handled = 1;
1333 data.period = counter->hw.last_period;
1334
1335 if (!x86_perf_counter_set_period(counter, hwc, idx))
1336 continue;
1337
1338 if (perf_counter_overflow(counter, 1, &data))
1339 p6_pmu_disable_counter(hwc, idx);
1340 }
1341
1342 if (handled)
1343 inc_irq_stat(apic_perf_irqs);
1344
1345 return handled;
1346}
1179 1347
1180/* 1348/*
1181 * This handler is triggered by the local APIC, so the APIC IRQ handling 1349 * This handler is triggered by the local APIC, so the APIC IRQ handling
@@ -1185,14 +1353,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1185{ 1353{
1186 struct perf_sample_data data; 1354 struct perf_sample_data data;
1187 struct cpu_hw_counters *cpuc; 1355 struct cpu_hw_counters *cpuc;
1188 int bit, cpu, loops; 1356 int bit, loops;
1189 u64 ack, status; 1357 u64 ack, status;
1190 1358
1191 data.regs = regs; 1359 data.regs = regs;
1192 data.addr = 0; 1360 data.addr = 0;
1193 1361
1194 cpu = smp_processor_id(); 1362 cpuc = &__get_cpu_var(cpu_hw_counters);
1195 cpuc = &per_cpu(cpu_hw_counters, cpu);
1196 1363
1197 perf_disable(); 1364 perf_disable();
1198 status = intel_pmu_get_status(); 1365 status = intel_pmu_get_status();
@@ -1249,14 +1416,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1249 struct cpu_hw_counters *cpuc; 1416 struct cpu_hw_counters *cpuc;
1250 struct perf_counter *counter; 1417 struct perf_counter *counter;
1251 struct hw_perf_counter *hwc; 1418 struct hw_perf_counter *hwc;
1252 int cpu, idx, handled = 0; 1419 int idx, handled = 0;
1253 u64 val; 1420 u64 val;
1254 1421
1255 data.regs = regs; 1422 data.regs = regs;
1256 data.addr = 0; 1423 data.addr = 0;
1257 1424
1258 cpu = smp_processor_id(); 1425 cpuc = &__get_cpu_var(cpu_hw_counters);
1259 cpuc = &per_cpu(cpu_hw_counters, cpu);
1260 1426
1261 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1427 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1262 if (!test_bit(idx, cpuc->active_mask)) 1428 if (!test_bit(idx, cpuc->active_mask))
@@ -1299,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)
1299 1465
1300void set_perf_counter_pending(void) 1466void set_perf_counter_pending(void)
1301{ 1467{
1468#ifdef CONFIG_X86_LOCAL_APIC
1302 apic->send_IPI_self(LOCAL_PENDING_VECTOR); 1469 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1470#endif
1303} 1471}
1304 1472
1305void perf_counters_lapic_init(void) 1473void perf_counters_lapic_init(void)
1306{ 1474{
1307 if (!x86_pmu_initialized()) 1475#ifdef CONFIG_X86_LOCAL_APIC
1476 if (!x86_pmu.apic || !x86_pmu_initialized())
1308 return; 1477 return;
1309 1478
1310 /* 1479 /*
1311 * Always use NMI for PMU 1480 * Always use NMI for PMU
1312 */ 1481 */
1313 apic_write(APIC_LVTPC, APIC_DM_NMI); 1482 apic_write(APIC_LVTPC, APIC_DM_NMI);
1483#endif
1314} 1484}
1315 1485
1316static int __kprobes 1486static int __kprobes
@@ -1334,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
1334 1504
1335 regs = args->regs; 1505 regs = args->regs;
1336 1506
1507#ifdef CONFIG_X86_LOCAL_APIC
1337 apic_write(APIC_LVTPC, APIC_DM_NMI); 1508 apic_write(APIC_LVTPC, APIC_DM_NMI);
1509#endif
1338 /* 1510 /*
1339 * Can't rely on the handled return value to say it was our NMI, two 1511 * Can't rely on the handled return value to say it was our NMI, two
1340 * counters could trigger 'simultaneously' raising two back-to-back NMIs. 1512 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1353,6 +1525,33 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
1353 .priority = 1 1525 .priority = 1
1354}; 1526};
1355 1527
1528static struct x86_pmu p6_pmu = {
1529 .name = "p6",
1530 .handle_irq = p6_pmu_handle_irq,
1531 .disable_all = p6_pmu_disable_all,
1532 .enable_all = p6_pmu_enable_all,
1533 .enable = p6_pmu_enable_counter,
1534 .disable = p6_pmu_disable_counter,
1535 .eventsel = MSR_P6_EVNTSEL0,
1536 .perfctr = MSR_P6_PERFCTR0,
1537 .event_map = p6_pmu_event_map,
1538 .raw_event = p6_pmu_raw_event,
1539 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1540 .apic = 1,
1541 .max_period = (1ULL << 31) - 1,
1542 .version = 0,
1543 .num_counters = 2,
1544 /*
1545 * Counters have 40 bits implemented. However they are designed such
1546 * that bits [32-39] are sign extensions of bit 31. As such the
1547 * effective width of a counter for P6-like PMU is 32 bits only.
1548 *
1549 * See IA-32 Intel Architecture Software developer manual Vol 3B
1550 */
1551 .counter_bits = 32,
1552 .counter_mask = (1ULL << 32) - 1,
1553};
1554
1356static struct x86_pmu intel_pmu = { 1555static struct x86_pmu intel_pmu = {
1357 .name = "Intel", 1556 .name = "Intel",
1358 .handle_irq = intel_pmu_handle_irq, 1557 .handle_irq = intel_pmu_handle_irq,
@@ -1365,6 +1564,7 @@ static struct x86_pmu intel_pmu = {
1365 .event_map = intel_pmu_event_map, 1564 .event_map = intel_pmu_event_map,
1366 .raw_event = intel_pmu_raw_event, 1565 .raw_event = intel_pmu_raw_event,
1367 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 1566 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
1567 .apic = 1,
1368 /* 1568 /*
1369 * Intel PMCs cannot be accessed sanely above 32 bit width, 1569 * Intel PMCs cannot be accessed sanely above 32 bit width,
1370 * so we install an artificial 1<<31 period regardless of 1570 * so we install an artificial 1<<31 period regardless of
@@ -1388,10 +1588,43 @@ static struct x86_pmu amd_pmu = {
1388 .num_counters = 4, 1588 .num_counters = 4,
1389 .counter_bits = 48, 1589 .counter_bits = 48,
1390 .counter_mask = (1ULL << 48) - 1, 1590 .counter_mask = (1ULL << 48) - 1,
1591 .apic = 1,
1391 /* use highest bit to detect overflow */ 1592 /* use highest bit to detect overflow */
1392 .max_period = (1ULL << 47) - 1, 1593 .max_period = (1ULL << 47) - 1,
1393}; 1594};
1394 1595
1596static int p6_pmu_init(void)
1597{
1598 switch (boot_cpu_data.x86_model) {
1599 case 1:
1600 case 3: /* Pentium Pro */
1601 case 5:
1602 case 6: /* Pentium II */
1603 case 7:
1604 case 8:
1605 case 11: /* Pentium III */
1606 break;
1607 case 9:
1608 case 13:
1609 /* Pentium M */
1610 break;
1611 default:
1612 pr_cont("unsupported p6 CPU model %d ",
1613 boot_cpu_data.x86_model);
1614 return -ENODEV;
1615 }
1616
1617 x86_pmu = p6_pmu;
1618
1619 if (!cpu_has_apic) {
1620 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1621 pr_info("no hardware sampling interrupt available.\n");
1622 x86_pmu.apic = 0;
1623 }
1624
1625 return 0;
1626}
1627
1395static int intel_pmu_init(void) 1628static int intel_pmu_init(void)
1396{ 1629{
1397 union cpuid10_edx edx; 1630 union cpuid10_edx edx;
@@ -1400,8 +1633,14 @@ static int intel_pmu_init(void)
1400 unsigned int ebx; 1633 unsigned int ebx;
1401 int version; 1634 int version;
1402 1635
1403 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 1636 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
1637 /* check for P6 processor family */
1638 if (boot_cpu_data.x86 == 6) {
1639 return p6_pmu_init();
1640 } else {
1404 return -ENODEV; 1641 return -ENODEV;
1642 }
1643 }
1405 1644
1406 /* 1645 /*
1407 * Check whether the Architectural PerfMon supports 1646 * Check whether the Architectural PerfMon supports
@@ -1561,6 +1800,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1561 1800
1562static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1801static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
1563static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 1802static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
1803static DEFINE_PER_CPU(int, in_nmi_frame);
1564 1804
1565 1805
1566static void 1806static void
@@ -1576,7 +1816,9 @@ static void backtrace_warning(void *data, char *msg)
1576 1816
1577static int backtrace_stack(void *data, char *name) 1817static int backtrace_stack(void *data, char *name)
1578{ 1818{
1579 /* Process all stacks: */ 1819 per_cpu(in_nmi_frame, smp_processor_id()) =
1820 x86_is_stack_id(NMI_STACK, name);
1821
1580 return 0; 1822 return 0;
1581} 1823}
1582 1824
@@ -1584,6 +1826,9 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1584{ 1826{
1585 struct perf_callchain_entry *entry = data; 1827 struct perf_callchain_entry *entry = data;
1586 1828
1829 if (per_cpu(in_nmi_frame, smp_processor_id()))
1830 return;
1831
1587 if (reliable) 1832 if (reliable)
1588 callchain_store(entry, addr); 1833 callchain_store(entry, addr);
1589} 1834}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 5c481f6205bf..e60ed740d2b3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -803,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz)
803 wd_ops->rearm(wd, nmi_hz); 803 wd_ops->rearm(wd, nmi_hz);
804 return 1; 804 return 1;
805} 805}
806
807int lapic_watchdog_ok(void)
808{
809 return wd_ops != NULL;
810}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index d593cd1f58dc..bca5fba91c9e 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -19,6 +19,12 @@
19 19
20#include "dumpstack.h" 20#include "dumpstack.h"
21 21
22/* Just a stub for now */
23int x86_is_stack_id(int id, char *name)
24{
25 return 0;
26}
27
22void dump_trace(struct task_struct *task, struct pt_regs *regs, 28void dump_trace(struct task_struct *task, struct pt_regs *regs,
23 unsigned long *stack, unsigned long bp, 29 unsigned long *stack, unsigned long bp,
24 const struct stacktrace_ops *ops, void *data) 30 const struct stacktrace_ops *ops, void *data)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index d35db5993fd6..54b0a3276766 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -19,10 +19,8 @@
19 19
20#include "dumpstack.h" 20#include "dumpstack.h"
21 21
22static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 22
23 unsigned *usedp, char **idp) 23static char x86_stack_ids[][8] = {
24{
25 static char ids[][8] = {
26 [DEBUG_STACK - 1] = "#DB", 24 [DEBUG_STACK - 1] = "#DB",
27 [NMI_STACK - 1] = "NMI", 25 [NMI_STACK - 1] = "NMI",
28 [DOUBLEFAULT_STACK - 1] = "#DF", 26 [DOUBLEFAULT_STACK - 1] = "#DF",
@@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
33 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 31 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
34#endif 32#endif
35 }; 33 };
34
35int x86_is_stack_id(int id, char *name)
36{
37 return x86_stack_ids[id - 1] == name;
38}
39
40static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
41 unsigned *usedp, char **idp)
42{
36 unsigned k; 43 unsigned k;
37 44
38 /* 45 /*
@@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
61 if (*usedp & (1U << k)) 68 if (*usedp & (1U << k))
62 break; 69 break;
63 *usedp |= 1U << k; 70 *usedp |= 1U << k;
64 *idp = ids[k]; 71 *idp = x86_stack_ids[k];
65 return (unsigned long *)end; 72 return (unsigned long *)end;
66 } 73 }
67 /* 74 /*
@@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
81 do { 88 do {
82 ++j; 89 ++j;
83 end -= EXCEPTION_STKSZ; 90 end -= EXCEPTION_STKSZ;
84 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); 91 x86_stack_ids[j][4] = '1' +
92 (j - N_EXCEPTION_STACKS);
85 } while (stack < end - EXCEPTION_STKSZ); 93 } while (stack < end - EXCEPTION_STKSZ);
86 if (*usedp & (1U << j)) 94 if (*usedp & (1U << j))
87 break; 95 break;
88 *usedp |= 1U << j; 96 *usedp |= 1U << j;
89 *idp = ids[j]; 97 *idp = x86_stack_ids[j];
90 return (unsigned long *)end; 98 return (unsigned long *)end;
91 } 99 }
92#endif 100#endif
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c4ca89d9aaf4..5cb5725b2bae 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -627,10 +627,9 @@ __init void e820_setup_gap(void)
627#ifdef CONFIG_X86_64 627#ifdef CONFIG_X86_64
628 if (!found) { 628 if (!found) {
629 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; 629 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
630 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " 630 printk(KERN_ERR
631 "address range\n" 631 "PCI: Warning: Cannot find a gap in the 32bit address range\n"
632 KERN_ERR "PCI: Unassigned devices with 32bit resource " 632 "PCI: Unassigned devices with 32bit resource registers may break!\n");
633 "registers may break!\n");
634 } 633 }
635#endif 634#endif
636 635
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 96f7ac0bbf01..fe26ba3e3451 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -354,7 +354,7 @@ void __init efi_init(void)
354 */ 354 */
355 c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); 355 c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
356 if (c16) { 356 if (c16) {
357 for (i = 0; i < sizeof(vendor) && *c16; ++i) 357 for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
358 vendor[i] = *c16++; 358 vendor[i] = *c16++;
359 vendor[i] = '\0'; 359 vendor[i] = '\0';
360 } else 360 } else
@@ -512,7 +512,7 @@ void __init efi_enter_virtual_mode(void)
512 && end_pfn <= max_pfn_mapped)) 512 && end_pfn <= max_pfn_mapped))
513 va = __va(md->phys_addr); 513 va = __va(md->phys_addr);
514 else 514 else
515 va = efi_ioremap(md->phys_addr, size); 515 va = efi_ioremap(md->phys_addr, size, md->type);
516 516
517 md->virt_addr = (u64) (unsigned long) va; 517 md->virt_addr = (u64) (unsigned long) va;
518 518
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 22c3b7828c50..ac0621a7ac3d 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void)
98 early_runtime_code_mapping_set_exec(0); 98 early_runtime_code_mapping_set_exec(0);
99} 99}
100 100
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) 101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
102 u32 type)
102{ 103{
103 unsigned long last_map_pfn; 104 unsigned long last_map_pfn;
104 105
106 if (type == EFI_MEMORY_MAPPED_IO)
107 return ioremap(phys_addr, size);
108
105 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); 109 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
106 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) 110 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
107 return NULL; 111 return NULL;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 8663afb56535..0d98a01cbdb2 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -602,7 +602,11 @@ ignore_int:
602#endif 602#endif
603 iret 603 iret
604 604
605.section .cpuinit.data,"wa" 605#ifndef CONFIG_HOTPLUG_CPU
606 __CPUINITDATA
607#else
608 __REFDATA
609#endif
606.align 4 610.align 4
607ENTRY(initial_code) 611ENTRY(initial_code)
608 .long i386_start_kernel 612 .long i386_start_kernel
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 696f0e475c2d..92b7703d3d58 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -187,7 +187,7 @@ static void __init apic_intr_init(void)
187#ifdef CONFIG_X86_THERMAL_VECTOR 187#ifdef CONFIG_X86_THERMAL_VECTOR
188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
189#endif 189#endif
190#ifdef CONFIG_X86_THRESHOLD 190#ifdef CONFIG_X86_MCE_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif 192#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) 193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a78ecad0c900..c664d515f613 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -200,7 +200,7 @@ static void kvm_leave_lazy_mmu(void)
200 state->mode = paravirt_get_lazy_mode(); 200 state->mode = paravirt_get_lazy_mode();
201} 201}
202 202
203static void paravirt_ops_setup(void) 203static void __init paravirt_ops_setup(void)
204{ 204{
205 pv_info.name = "KVM"; 205 pv_info.name = "KVM";
206 pv_info.paravirt_enabled = 1; 206 pv_info.paravirt_enabled = 1;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 846510b78a09..2a62d843f015 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id)
347 347
348static struct irqaction mfgptirq = { 348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick, 349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING, 350 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
351 .name = "mfgpt-timer" 351 .name = "mfgpt-timer"
352}; 352};
353 353
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index cfd9f9063896..d2e56b8f48e7 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -675,7 +675,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
675 nommu: 675 nommu:
676 /* Should not happen anymore */ 676 /* Should not happen anymore */
677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
678 KERN_WARNING "falling back to iommu=soft.\n"); 678 "falling back to iommu=soft.\n");
679 return -1; 679 return -1;
680} 680}
681 681
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 4f9c55f3a7c0..03801f2f761f 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
60 "adc %5,%%edx ; " 60 "adc %5,%%edx ; "
61 : "=A" (product), "=r" (tmp1), "=r" (tmp2) 61 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); 62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
63#elif __x86_64__ 63#elif defined(__x86_64__)
64 __asm__ ( 64 __asm__ (
65 "mul %%rdx ; shrd $32,%%rdx,%%rax" 65 "mul %%rdx ; shrd $32,%%rdx,%%rax"
66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); 66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d2d1ce8170f0..a06e8d101844 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/efi.h> 5#include <linux/efi.h>
6#include <linux/dmi.h>
6#include <acpi/reboot.h> 7#include <acpi/reboot.h>
7#include <asm/io.h> 8#include <asm/io.h>
8#include <asm/apic.h> 9#include <asm/apic.h>
@@ -17,7 +18,6 @@
17#include <asm/cpu.h> 18#include <asm/cpu.h>
18 19
19#ifdef CONFIG_X86_32 20#ifdef CONFIG_X86_32
20# include <linux/dmi.h>
21# include <linux/ctype.h> 21# include <linux/ctype.h>
22# include <linux/mc146818rtc.h> 22# include <linux/mc146818rtc.h>
23#else 23#else
@@ -249,6 +249,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), 249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
250 }, 250 },
251 }, 251 },
252 { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */
253 .callback = set_bios_reboot,
254 .ident = "CompuLab SBC-FITPC2",
255 .matches = {
256 DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
257 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
258 },
259 },
252 { } 260 { }
253}; 261};
254 262
@@ -396,6 +404,46 @@ EXPORT_SYMBOL(machine_real_restart);
396 404
397#endif /* CONFIG_X86_32 */ 405#endif /* CONFIG_X86_32 */
398 406
407/*
408 * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
409 */
410static int __init set_pci_reboot(const struct dmi_system_id *d)
411{
412 if (reboot_type != BOOT_CF9) {
413 reboot_type = BOOT_CF9;
414 printk(KERN_INFO "%s series board detected. "
415 "Selecting PCI-method for reboots.\n", d->ident);
416 }
417 return 0;
418}
419
420static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
421 { /* Handle problems with rebooting on Apple MacBook5 */
422 .callback = set_pci_reboot,
423 .ident = "Apple MacBook5",
424 .matches = {
425 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
426 DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
427 },
428 },
429 { /* Handle problems with rebooting on Apple MacBookPro5 */
430 .callback = set_pci_reboot,
431 .ident = "Apple MacBookPro5",
432 .matches = {
433 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
434 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
435 },
436 },
437 { }
438};
439
440static int __init pci_reboot_init(void)
441{
442 dmi_check_system(pci_reboot_dmi_table);
443 return 0;
444}
445core_initcall(pci_reboot_init);
446
399static inline void kb_wait(void) 447static inline void kb_wait(void)
400{ 448{
401 int i; 449 int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index de2cab132844..63f32d220ef2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -672,6 +672,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
672 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), 672 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
673 }, 673 },
674 }, 674 },
675 {
676 /*
677 * AMI BIOS with low memory corruption was found on Intel DG45ID board.
678 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
679 * match only DMI_BOARD_NAME and see if there is more bad products
680 * with this vendor.
681 */
682 .callback = dmi_low_memory_corruption,
683 .ident = "AMI BIOS",
684 .matches = {
685 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
686 },
687 },
675#endif 688#endif
676 {} 689 {}
677}; 690};
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 7501bb14bd51..a26ff61e2fb0 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -177,7 +177,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
177 } 177 }
178 178
179 /* allocate and build unit_map */ 179 /* allocate and build unit_map */
180 unit_map_size = num_possible_cpus() * sizeof(int); 180 unit_map_size = nr_cpu_ids * sizeof(int);
181 unit_map = alloc_bootmem_nopanic(unit_map_size); 181 unit_map = alloc_bootmem_nopanic(unit_map_size);
182 if (!unit_map) { 182 if (!unit_map) {
183 pr_warning("PERCPU: failed to allocate unit_map\n"); 183 pr_warning("PERCPU: failed to allocate unit_map\n");
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6e1a368d21d4..71f4368b357e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -275,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
275 * use the TSC value at the transitions to calculate a pretty 275 * use the TSC value at the transitions to calculate a pretty
276 * good value for the TSC frequencty. 276 * good value for the TSC frequencty.
277 */ 277 */
278static inline int pit_verify_msb(unsigned char val)
279{
280 /* Ignore LSB */
281 inb(0x42);
282 return inb(0x42) == val;
283}
284
278static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) 285static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
279{ 286{
280 int count; 287 int count;
281 u64 tsc = 0; 288 u64 tsc = 0;
282 289
283 for (count = 0; count < 50000; count++) { 290 for (count = 0; count < 50000; count++) {
284 /* Ignore LSB */ 291 if (!pit_verify_msb(val))
285 inb(0x42);
286 if (inb(0x42) != val)
287 break; 292 break;
288 tsc = get_cycles(); 293 tsc = get_cycles();
289 } 294 }
@@ -336,8 +341,7 @@ static unsigned long quick_pit_calibrate(void)
336 * to do that is to just read back the 16-bit counter 341 * to do that is to just read back the 16-bit counter
337 * once from the PIT. 342 * once from the PIT.
338 */ 343 */
339 inb(0x42); 344 pit_verify_msb(0);
340 inb(0x42);
341 345
342 if (pit_expect_msb(0xff, &tsc, &d1)) { 346 if (pit_expect_msb(0xff, &tsc, &d1)) {
343 for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { 347 for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
@@ -348,8 +352,19 @@ static unsigned long quick_pit_calibrate(void)
348 * Iterate until the error is less than 500 ppm 352 * Iterate until the error is less than 500 ppm
349 */ 353 */
350 delta -= tsc; 354 delta -= tsc;
351 if (d1+d2 < delta >> 11) 355 if (d1+d2 >= delta >> 11)
352 goto success; 356 continue;
357
358 /*
359 * Check the PIT one more time to verify that
360 * all TSC reads were stable wrt the PIT.
361 *
362 * This also guarantees serialization of the
363 * last cycle read ('d2') in pit_expect_msb.
364 */
365 if (!pit_verify_msb(0xfe - i))
366 break;
367 goto success;
353 } 368 }
354 } 369 }
355 printk("Fast TSC calibration failed\n"); 370 printk("Fast TSC calibration failed\n");
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b263423fbe2a..95a7289e4b0c 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
441 ap.ds = __USER_DS; 441 ap.ds = __USER_DS;
442 ap.es = __USER_DS; 442 ap.es = __USER_DS;
443 ap.fs = __KERNEL_PERCPU; 443 ap.fs = __KERNEL_PERCPU;
444 ap.gs = 0; 444 ap.gs = __KERNEL_STACK_CANARY;
445 445
446 ap.eflags = 0; 446 ap.eflags = 0;
447 447
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index b600c843710b..bbf4fd044d07 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -112,11 +112,6 @@ SECTIONS
112 _sdata = .; 112 _sdata = .;
113 DATA_DATA 113 DATA_DATA
114 CONSTRUCTORS 114 CONSTRUCTORS
115
116#ifdef CONFIG_X86_64
117 /* End of data section */
118 _edata = .;
119#endif
120 } :data 115 } :data
121 116
122#ifdef CONFIG_X86_32 117#ifdef CONFIG_X86_32
@@ -156,10 +151,8 @@ SECTIONS
156 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { 151 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
157 *(.data.read_mostly) 152 *(.data.read_mostly)
158 153
159#ifdef CONFIG_X86_32
160 /* End of data section */ 154 /* End of data section */
161 _edata = .; 155 _edata = .;
162#endif
163 } 156 }
164 157
165#ifdef CONFIG_X86_64 158#ifdef CONFIG_X86_64
@@ -397,8 +390,8 @@ SECTIONS
397 390
398 391
399#ifdef CONFIG_X86_32 392#ifdef CONFIG_X86_32
400ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), 393. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
401 "kernel image bigger than KERNEL_IMAGE_SIZE") 394 "kernel image bigger than KERNEL_IMAGE_SIZE");
402#else 395#else
403/* 396/*
404 * Per-cpu symbols which need to be offset from __per_cpu_load 397 * Per-cpu symbols which need to be offset from __per_cpu_load
@@ -411,12 +404,12 @@ INIT_PER_CPU(irq_stack_union);
411/* 404/*
412 * Build-time check on the image size: 405 * Build-time check on the image size:
413 */ 406 */
414ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), 407. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
415 "kernel image bigger than KERNEL_IMAGE_SIZE") 408 "kernel image bigger than KERNEL_IMAGE_SIZE");
416 409
417#ifdef CONFIG_SMP 410#ifdef CONFIG_SMP
418ASSERT((per_cpu__irq_stack_union == 0), 411. = ASSERT((per_cpu__irq_stack_union == 0),
419 "irq_stack_union is not at start of per-cpu area"); 412 "irq_stack_union is not at start of per-cpu area");
420#endif 413#endif
421 414
422#endif /* CONFIG_X86_32 */ 415#endif /* CONFIG_X86_32 */
@@ -424,7 +417,7 @@ ASSERT((per_cpu__irq_stack_union == 0),
424#ifdef CONFIG_KEXEC 417#ifdef CONFIG_KEXEC
425#include <asm/kexec.h> 418#include <asm/kexec.h>
426 419
427ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, 420. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
428 "kexec control code size is too big") 421 "kexec control code size is too big");
429#endif 422#endif
430 423
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4d6f0d293ee2..21f68e00524f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -104,6 +104,9 @@ static s64 __kpit_elapsed(struct kvm *kvm)
104 ktime_t remaining; 104 ktime_t remaining;
105 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; 105 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
106 106
107 if (!ps->pit_timer.period)
108 return 0;
109
107 /* 110 /*
108 * The Counter does not stop when it reaches zero. In 111 * The Counter does not stop when it reaches zero. In
109 * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to 112 * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7030b5f911bf..0ef5bb2b4043 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -489,16 +489,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
489 * 489 *
490 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc 490 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
491 * containing more mappings. 491 * containing more mappings.
492 *
493 * Returns the number of rmap entries before the spte was added or zero if
494 * the spte was not added.
495 *
492 */ 496 */
493static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) 497static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
494{ 498{
495 struct kvm_mmu_page *sp; 499 struct kvm_mmu_page *sp;
496 struct kvm_rmap_desc *desc; 500 struct kvm_rmap_desc *desc;
497 unsigned long *rmapp; 501 unsigned long *rmapp;
498 int i; 502 int i, count = 0;
499 503
500 if (!is_rmap_pte(*spte)) 504 if (!is_rmap_pte(*spte))
501 return; 505 return count;
502 gfn = unalias_gfn(vcpu->kvm, gfn); 506 gfn = unalias_gfn(vcpu->kvm, gfn);
503 sp = page_header(__pa(spte)); 507 sp = page_header(__pa(spte));
504 sp->gfns[spte - sp->spt] = gfn; 508 sp->gfns[spte - sp->spt] = gfn;
@@ -515,8 +519,10 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
515 } else { 519 } else {
516 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 520 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
517 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 521 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
518 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) 522 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) {
519 desc = desc->more; 523 desc = desc->more;
524 count += RMAP_EXT;
525 }
520 if (desc->shadow_ptes[RMAP_EXT-1]) { 526 if (desc->shadow_ptes[RMAP_EXT-1]) {
521 desc->more = mmu_alloc_rmap_desc(vcpu); 527 desc->more = mmu_alloc_rmap_desc(vcpu);
522 desc = desc->more; 528 desc = desc->more;
@@ -525,6 +531,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
525 ; 531 ;
526 desc->shadow_ptes[i] = spte; 532 desc->shadow_ptes[i] = spte;
527 } 533 }
534 return count;
528} 535}
529 536
530static void rmap_desc_remove_entry(unsigned long *rmapp, 537static void rmap_desc_remove_entry(unsigned long *rmapp,
@@ -754,6 +761,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
754 return young; 761 return young;
755} 762}
756 763
764#define RMAP_RECYCLE_THRESHOLD 1000
765
766static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage)
767{
768 unsigned long *rmapp;
769
770 gfn = unalias_gfn(vcpu->kvm, gfn);
771 rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
772
773 kvm_unmap_rmapp(vcpu->kvm, rmapp);
774 kvm_flush_remote_tlbs(vcpu->kvm);
775}
776
757int kvm_age_hva(struct kvm *kvm, unsigned long hva) 777int kvm_age_hva(struct kvm *kvm, unsigned long hva)
758{ 778{
759 return kvm_handle_hva(kvm, hva, kvm_age_rmapp); 779 return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
@@ -1407,24 +1427,25 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1407 */ 1427 */
1408void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1428void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1409{ 1429{
1430 int used_pages;
1431
1432 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1433 used_pages = max(0, used_pages);
1434
1410 /* 1435 /*
1411 * If we set the number of mmu pages to be smaller be than the 1436 * If we set the number of mmu pages to be smaller be than the
1412 * number of actived pages , we must to free some mmu pages before we 1437 * number of actived pages , we must to free some mmu pages before we
1413 * change the value 1438 * change the value
1414 */ 1439 */
1415 1440
1416 if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > 1441 if (used_pages > kvm_nr_mmu_pages) {
1417 kvm_nr_mmu_pages) { 1442 while (used_pages > kvm_nr_mmu_pages) {
1418 int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
1419 - kvm->arch.n_free_mmu_pages;
1420
1421 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
1422 struct kvm_mmu_page *page; 1443 struct kvm_mmu_page *page;
1423 1444
1424 page = container_of(kvm->arch.active_mmu_pages.prev, 1445 page = container_of(kvm->arch.active_mmu_pages.prev,
1425 struct kvm_mmu_page, link); 1446 struct kvm_mmu_page, link);
1426 kvm_mmu_zap_page(kvm, page); 1447 kvm_mmu_zap_page(kvm, page);
1427 n_used_mmu_pages--; 1448 used_pages--;
1428 } 1449 }
1429 kvm->arch.n_free_mmu_pages = 0; 1450 kvm->arch.n_free_mmu_pages = 0;
1430 } 1451 }
@@ -1740,6 +1761,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1740{ 1761{
1741 int was_rmapped = 0; 1762 int was_rmapped = 0;
1742 int was_writeble = is_writeble_pte(*shadow_pte); 1763 int was_writeble = is_writeble_pte(*shadow_pte);
1764 int rmap_count;
1743 1765
1744 pgprintk("%s: spte %llx access %x write_fault %d" 1766 pgprintk("%s: spte %llx access %x write_fault %d"
1745 " user_fault %d gfn %lx\n", 1767 " user_fault %d gfn %lx\n",
@@ -1781,9 +1803,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1781 1803
1782 page_header_update_slot(vcpu->kvm, shadow_pte, gfn); 1804 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
1783 if (!was_rmapped) { 1805 if (!was_rmapped) {
1784 rmap_add(vcpu, shadow_pte, gfn, largepage); 1806 rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage);
1785 if (!is_rmap_pte(*shadow_pte)) 1807 if (!is_rmap_pte(*shadow_pte))
1786 kvm_release_pfn_clean(pfn); 1808 kvm_release_pfn_clean(pfn);
1809 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1810 rmap_recycle(vcpu, gfn, largepage);
1787 } else { 1811 } else {
1788 if (was_writeble) 1812 if (was_writeble)
1789 kvm_release_pfn_dirty(pfn); 1813 kvm_release_pfn_dirty(pfn);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 71510e07e69e..b1f658ad2f06 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -711,6 +711,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
711 svm->vmcb->control.tsc_offset += delta; 711 svm->vmcb->control.tsc_offset += delta;
712 vcpu->cpu = cpu; 712 vcpu->cpu = cpu;
713 kvm_migrate_timers(vcpu); 713 kvm_migrate_timers(vcpu);
714 svm->asid_generation = 0;
714 } 715 }
715 716
716 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 717 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
@@ -1031,7 +1032,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
1031 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 1032 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1032 } 1033 }
1033 1034
1034 svm->vcpu.cpu = svm_data->cpu;
1035 svm->asid_generation = svm_data->asid_generation; 1035 svm->asid_generation = svm_data->asid_generation;
1036 svm->vmcb->control.asid = svm_data->next_asid++; 1036 svm->vmcb->control.asid = svm_data->next_asid++;
1037} 1037}
@@ -2300,8 +2300,8 @@ static void pre_svm_run(struct vcpu_svm *svm)
2300 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 2300 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
2301 2301
2302 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 2302 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
2303 if (svm->vcpu.cpu != cpu || 2303 /* FIXME: handle wraparound of asid_generation */
2304 svm->asid_generation != svm_data->asid_generation) 2304 if (svm->asid_generation != svm_data->asid_generation)
2305 new_asid(svm, svm_data); 2305 new_asid(svm, svm_data);
2306} 2306}
2307 2307
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 356a0ce85c68..29f912927a58 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3157,8 +3157,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3157 struct vcpu_vmx *vmx = to_vmx(vcpu); 3157 struct vcpu_vmx *vmx = to_vmx(vcpu);
3158 enum emulation_result err = EMULATE_DONE; 3158 enum emulation_result err = EMULATE_DONE;
3159 3159
3160 preempt_enable();
3161 local_irq_enable(); 3160 local_irq_enable();
3161 preempt_enable();
3162 3162
3163 while (!guest_state_valid(vcpu)) { 3163 while (!guest_state_valid(vcpu)) {
3164 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3164 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3168,7 +3168,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3168 3168
3169 if (err != EMULATE_DONE) { 3169 if (err != EMULATE_DONE) {
3170 kvm_report_emulation_failure(vcpu, "emulation failure"); 3170 kvm_report_emulation_failure(vcpu, "emulation failure");
3171 return; 3171 break;
3172 } 3172 }
3173 3173
3174 if (signal_pending(current)) 3174 if (signal_pending(current))
@@ -3177,8 +3177,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3177 schedule(); 3177 schedule();
3178 } 3178 }
3179 3179
3180 local_irq_disable();
3181 preempt_disable(); 3180 preempt_disable();
3181 local_irq_disable();
3182 3182
3183 vmx->invalid_state_emulation_result = err; 3183 vmx->invalid_state_emulation_result = err;
3184} 3184}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fe5474aec41a..3d4529011828 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -704,11 +704,48 @@ static bool msr_mtrr_valid(unsigned msr)
704 return false; 704 return false;
705} 705}
706 706
707static bool valid_pat_type(unsigned t)
708{
709 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
710}
711
712static bool valid_mtrr_type(unsigned t)
713{
714 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
715}
716
717static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
718{
719 int i;
720
721 if (!msr_mtrr_valid(msr))
722 return false;
723
724 if (msr == MSR_IA32_CR_PAT) {
725 for (i = 0; i < 8; i++)
726 if (!valid_pat_type((data >> (i * 8)) & 0xff))
727 return false;
728 return true;
729 } else if (msr == MSR_MTRRdefType) {
730 if (data & ~0xcff)
731 return false;
732 return valid_mtrr_type(data & 0xff);
733 } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
734 for (i = 0; i < 8 ; i++)
735 if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
736 return false;
737 return true;
738 }
739
740 /* variable MTRRs */
741 return valid_mtrr_type(data & 0xff);
742}
743
707static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 744static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
708{ 745{
709 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; 746 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
710 747
711 if (!msr_mtrr_valid(msr)) 748 if (!mtrr_valid(vcpu, msr, data))
712 return 1; 749 return 1;
713 750
714 if (msr == MSR_MTRRdefType) { 751 if (msr == MSR_MTRRdefType) {
@@ -1079,14 +1116,13 @@ long kvm_arch_dev_ioctl(struct file *filp,
1079 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) 1116 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1080 goto out; 1117 goto out;
1081 r = -E2BIG; 1118 r = -E2BIG;
1082 if (n < num_msrs_to_save) 1119 if (n < msr_list.nmsrs)
1083 goto out; 1120 goto out;
1084 r = -EFAULT; 1121 r = -EFAULT;
1085 if (copy_to_user(user_msr_list->indices, &msrs_to_save, 1122 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1086 num_msrs_to_save * sizeof(u32))) 1123 num_msrs_to_save * sizeof(u32)))
1087 goto out; 1124 goto out;
1088 if (copy_to_user(user_msr_list->indices 1125 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1089 + num_msrs_to_save * sizeof(u32),
1090 &emulated_msrs, 1126 &emulated_msrs,
1091 ARRAY_SIZE(emulated_msrs) * sizeof(u32))) 1127 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1092 goto out; 1128 goto out;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7bc65f0f62c4..d677fa9ca650 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -22,7 +22,8 @@
22 * 22 *
23 * So how does the kernel know it's a Guest? We'll see that later, but let's 23 * So how does the kernel know it's a Guest? We'll see that later, but let's
24 * just say that we end up here where we replace the native functions various 24 * just say that we end up here where we replace the native functions various
25 * "paravirt" structures with our Guest versions, then boot like normal. :*/ 25 * "paravirt" structures with our Guest versions, then boot like normal.
26:*/
26 27
27/* 28/*
28 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 29 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
@@ -74,7 +75,8 @@
74 * 75 *
75 * The Guest in our tale is a simple creature: identical to the Host but 76 * The Guest in our tale is a simple creature: identical to the Host but
76 * behaving in simplified but equivalent ways. In particular, the Guest is the 77 * behaving in simplified but equivalent ways. In particular, the Guest is the
77 * same kernel as the Host (or at least, built from the same source code). :*/ 78 * same kernel as the Host (or at least, built from the same source code).
79:*/
78 80
79struct lguest_data lguest_data = { 81struct lguest_data lguest_data = {
80 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, 82 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
@@ -85,7 +87,8 @@ struct lguest_data lguest_data = {
85 .syscall_vec = SYSCALL_VECTOR, 87 .syscall_vec = SYSCALL_VECTOR,
86}; 88};
87 89
88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 90/*G:037
91 * async_hcall() is pretty simple: I'm quite proud of it really. We have a
89 * ring buffer of stored hypercalls which the Host will run though next time we 92 * ring buffer of stored hypercalls which the Host will run though next time we
90 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall 93 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go, 94 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
@@ -94,7 +97,8 @@ struct lguest_data lguest_data = {
94 * If we come around to a slot which hasn't been finished, then the table is 97 * If we come around to a slot which hasn't been finished, then the table is
95 * full and we just make the hypercall directly. This has the nice side 98 * full and we just make the hypercall directly. This has the nice side
96 * effect of causing the Host to run all the stored calls in the ring buffer 99 * effect of causing the Host to run all the stored calls in the ring buffer
97 * which empties it for next time! */ 100 * which empties it for next time!
101 */
98static void async_hcall(unsigned long call, unsigned long arg1, 102static void async_hcall(unsigned long call, unsigned long arg1,
99 unsigned long arg2, unsigned long arg3, 103 unsigned long arg2, unsigned long arg3,
100 unsigned long arg4) 104 unsigned long arg4)
@@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1,
103 static unsigned int next_call; 107 static unsigned int next_call;
104 unsigned long flags; 108 unsigned long flags;
105 109
106 /* Disable interrupts if not already disabled: we don't want an 110 /*
111 * Disable interrupts if not already disabled: we don't want an
107 * interrupt handler making a hypercall while we're already doing 112 * interrupt handler making a hypercall while we're already doing
108 * one! */ 113 * one!
114 */
109 local_irq_save(flags); 115 local_irq_save(flags);
110 if (lguest_data.hcall_status[next_call] != 0xFF) { 116 if (lguest_data.hcall_status[next_call] != 0xFF) {
111 /* Table full, so do normal hcall which will flush table. */ 117 /* Table full, so do normal hcall which will flush table. */
@@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1,
125 local_irq_restore(flags); 131 local_irq_restore(flags);
126} 132}
127 133
128/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first 134/*G:035
129 * real optimization trick! 135 * Notice the lazy_hcall() above, rather than hcall(). This is our first real
136 * optimization trick!
130 * 137 *
131 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do 138 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
132 * them as a batch when lazy_mode is eventually turned off. Because hypercalls 139 * them as a batch when lazy_mode is eventually turned off. Because hypercalls
@@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1,
136 * lguest_leave_lazy_mode(). 143 * lguest_leave_lazy_mode().
137 * 144 *
138 * So, when we're in lazy mode, we call async_hcall() to store the call for 145 * So, when we're in lazy mode, we call async_hcall() to store the call for
139 * future processing: */ 146 * future processing:
147 */
140static void lazy_hcall1(unsigned long call, 148static void lazy_hcall1(unsigned long call,
141 unsigned long arg1) 149 unsigned long arg1)
142{ 150{
@@ -146,6 +154,7 @@ static void lazy_hcall1(unsigned long call,
146 async_hcall(call, arg1, 0, 0, 0); 154 async_hcall(call, arg1, 0, 0, 0);
147} 155}
148 156
157/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
149static void lazy_hcall2(unsigned long call, 158static void lazy_hcall2(unsigned long call,
150 unsigned long arg1, 159 unsigned long arg1,
151 unsigned long arg2) 160 unsigned long arg2)
@@ -181,8 +190,10 @@ static void lazy_hcall4(unsigned long call,
181} 190}
182#endif 191#endif
183 192
184/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 193/*G:036
185 * issue the do-nothing hypercall to flush any stored calls. */ 194 * When lazy mode is turned off reset the per-cpu lazy mode variable and then
195 * issue the do-nothing hypercall to flush any stored calls.
196:*/
186static void lguest_leave_lazy_mmu_mode(void) 197static void lguest_leave_lazy_mmu_mode(void)
187{ 198{
188 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 199 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
@@ -208,9 +219,11 @@ static void lguest_end_context_switch(struct task_struct *next)
208 * check there before it tries to deliver an interrupt. 219 * check there before it tries to deliver an interrupt.
209 */ 220 */
210 221
211/* save_flags() is expected to return the processor state (ie. "flags"). The 222/*
223 * save_flags() is expected to return the processor state (ie. "flags"). The
212 * flags word contains all kind of stuff, but in practice Linux only cares 224 * flags word contains all kind of stuff, but in practice Linux only cares
213 * about the interrupt flag. Our "save_flags()" just returns that. */ 225 * about the interrupt flag. Our "save_flags()" just returns that.
226 */
214static unsigned long save_fl(void) 227static unsigned long save_fl(void)
215{ 228{
216 return lguest_data.irq_enabled; 229 return lguest_data.irq_enabled;
@@ -222,13 +235,15 @@ static void irq_disable(void)
222 lguest_data.irq_enabled = 0; 235 lguest_data.irq_enabled = 0;
223} 236}
224 237
225/* Let's pause a moment. Remember how I said these are called so often? 238/*
239 * Let's pause a moment. Remember how I said these are called so often?
226 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to 240 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
227 * break some rules. In particular, these functions are assumed to save their 241 * break some rules. In particular, these functions are assumed to save their
228 * own registers if they need to: normal C functions assume they can trash the 242 * own registers if they need to: normal C functions assume they can trash the
229 * eax register. To use normal C functions, we use 243 * eax register. To use normal C functions, we use
230 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the 244 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
231 * C function, then restores it. */ 245 * C function, then restores it.
246 */
232PV_CALLEE_SAVE_REGS_THUNK(save_fl); 247PV_CALLEE_SAVE_REGS_THUNK(save_fl);
233PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 248PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
234/*:*/ 249/*:*/
@@ -237,18 +252,18 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
237extern void lg_irq_enable(void); 252extern void lg_irq_enable(void);
238extern void lg_restore_fl(unsigned long flags); 253extern void lg_restore_fl(unsigned long flags);
239 254
240/*M:003 Note that we don't check for outstanding interrupts when we re-enable 255/*M:003
241 * them (or when we unmask an interrupt). This seems to work for the moment, 256 * We could be more efficient in our checking of outstanding interrupts, rather
242 * since interrupts are rare and we'll just get the interrupt on the next timer 257 * than using a branch. One way would be to put the "irq_enabled" field in a
243 * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way 258 * page by itself, and have the Host write-protect it when an interrupt comes
244 * would be to put the "irq_enabled" field in a page by itself, and have the 259 * in when irqs are disabled. There will then be a page fault as soon as
245 * Host write-protect it when an interrupt comes in when irqs are disabled. 260 * interrupts are re-enabled.
246 * There will then be a page fault as soon as interrupts are re-enabled.
247 * 261 *
248 * A better method is to implement soft interrupt disable generally for x86: 262 * A better method is to implement soft interrupt disable generally for x86:
249 * instead of disabling interrupts, we set a flag. If an interrupt does come 263 * instead of disabling interrupts, we set a flag. If an interrupt does come
250 * in, we then disable them for real. This is uncommon, so we could simply use 264 * in, we then disable them for real. This is uncommon, so we could simply use
251 * a hypercall for interrupt control and not worry about efficiency. :*/ 265 * a hypercall for interrupt control and not worry about efficiency.
266:*/
252 267
253/*G:034 268/*G:034
254 * The Interrupt Descriptor Table (IDT). 269 * The Interrupt Descriptor Table (IDT).
@@ -261,10 +276,12 @@ extern void lg_restore_fl(unsigned long flags);
261static void lguest_write_idt_entry(gate_desc *dt, 276static void lguest_write_idt_entry(gate_desc *dt,
262 int entrynum, const gate_desc *g) 277 int entrynum, const gate_desc *g)
263{ 278{
264 /* The gate_desc structure is 8 bytes long: we hand it to the Host in 279 /*
280 * The gate_desc structure is 8 bytes long: we hand it to the Host in
265 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors 281 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
266 * around like this; typesafety wasn't a big concern in Linux's early 282 * around like this; typesafety wasn't a big concern in Linux's early
267 * years. */ 283 * years.
284 */
268 u32 *desc = (u32 *)g; 285 u32 *desc = (u32 *)g;
269 /* Keep the local copy up to date. */ 286 /* Keep the local copy up to date. */
270 native_write_idt_entry(dt, entrynum, g); 287 native_write_idt_entry(dt, entrynum, g);
@@ -272,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt,
272 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); 289 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
273} 290}
274 291
275/* Changing to a different IDT is very rare: we keep the IDT up-to-date every 292/*
293 * Changing to a different IDT is very rare: we keep the IDT up-to-date every
276 * time it is written, so we can simply loop through all entries and tell the 294 * time it is written, so we can simply loop through all entries and tell the
277 * Host about them. */ 295 * Host about them.
296 */
278static void lguest_load_idt(const struct desc_ptr *desc) 297static void lguest_load_idt(const struct desc_ptr *desc)
279{ 298{
280 unsigned int i; 299 unsigned int i;
@@ -305,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
305 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); 324 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
306} 325}
307 326
308/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, 327/*
328 * For a single GDT entry which changes, we do the lazy thing: alter our GDT,
309 * then tell the Host to reload the entire thing. This operation is so rare 329 * then tell the Host to reload the entire thing. This operation is so rare
310 * that this naive implementation is reasonable. */ 330 * that this naive implementation is reasonable.
331 */
311static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, 332static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
312 const void *desc, int type) 333 const void *desc, int type)
313{ 334{
@@ -317,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
317 dt[entrynum].a, dt[entrynum].b); 338 dt[entrynum].a, dt[entrynum].b);
318} 339}
319 340
320/* OK, I lied. There are three "thread local storage" GDT entries which change 341/*
342 * OK, I lied. There are three "thread local storage" GDT entries which change
321 * on every context switch (these three entries are how glibc implements 343 * on every context switch (these three entries are how glibc implements
322 * __thread variables). So we have a hypercall specifically for this case. */ 344 * __thread variables). So we have a hypercall specifically for this case.
345 */
323static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) 346static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
324{ 347{
325 /* There's one problem which normal hardware doesn't have: the Host 348 /*
349 * There's one problem which normal hardware doesn't have: the Host
326 * can't handle us removing entries we're currently using. So we clear 350 * can't handle us removing entries we're currently using. So we clear
327 * the GS register here: if it's needed it'll be reloaded anyway. */ 351 * the GS register here: if it's needed it'll be reloaded anyway.
352 */
328 lazy_load_gs(0); 353 lazy_load_gs(0);
329 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); 354 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
330} 355}
331 356
332/*G:038 That's enough excitement for now, back to ploughing through each of 357/*G:038
333 * the different pv_ops structures (we're about 1/3 of the way through). 358 * That's enough excitement for now, back to ploughing through each of the
359 * different pv_ops structures (we're about 1/3 of the way through).
334 * 360 *
335 * This is the Local Descriptor Table, another weird Intel thingy. Linux only 361 * This is the Local Descriptor Table, another weird Intel thingy. Linux only
336 * uses this for some strange applications like Wine. We don't do anything 362 * uses this for some strange applications like Wine. We don't do anything
337 * here, so they'll get an informative and friendly Segmentation Fault. */ 363 * here, so they'll get an informative and friendly Segmentation Fault.
364 */
338static void lguest_set_ldt(const void *addr, unsigned entries) 365static void lguest_set_ldt(const void *addr, unsigned entries)
339{ 366{
340} 367}
341 368
342/* This loads a GDT entry into the "Task Register": that entry points to a 369/*
370 * This loads a GDT entry into the "Task Register": that entry points to a
343 * structure called the Task State Segment. Some comments scattered though the 371 * structure called the Task State Segment. Some comments scattered though the
344 * kernel code indicate that this used for task switching in ages past, along 372 * kernel code indicate that this used for task switching in ages past, along
345 * with blood sacrifice and astrology. 373 * with blood sacrifice and astrology.
@@ -347,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries)
347 * Now there's nothing interesting in here that we don't get told elsewhere. 375 * Now there's nothing interesting in here that we don't get told elsewhere.
348 * But the native version uses the "ltr" instruction, which makes the Host 376 * But the native version uses the "ltr" instruction, which makes the Host
349 * complain to the Guest about a Segmentation Fault and it'll oops. So we 377 * complain to the Guest about a Segmentation Fault and it'll oops. So we
350 * override the native version with a do-nothing version. */ 378 * override the native version with a do-nothing version.
379 */
351static void lguest_load_tr_desc(void) 380static void lguest_load_tr_desc(void)
352{ 381{
353} 382}
354 383
355/* The "cpuid" instruction is a way of querying both the CPU identity 384/*
385 * The "cpuid" instruction is a way of querying both the CPU identity
356 * (manufacturer, model, etc) and its features. It was introduced before the 386 * (manufacturer, model, etc) and its features. It was introduced before the
357 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. 387 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
358 * As you might imagine, after a decade and a half this treatment, it is now a 388 * As you might imagine, after a decade and a half this treatment, it is now a
359 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. 389 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
360 * 390 *
361 * This instruction even it has its own Wikipedia entry. The Wikipedia entry 391 * This instruction even it has its own Wikipedia entry. The Wikipedia entry
362 * has been translated into 4 languages. I am not making this up! 392 * has been translated into 5 languages. I am not making this up!
363 * 393 *
364 * We could get funky here and identify ourselves as "GenuineLguest", but 394 * We could get funky here and identify ourselves as "GenuineLguest", but
365 * instead we just use the real "cpuid" instruction. Then I pretty much turned 395 * instead we just use the real "cpuid" instruction. Then I pretty much turned
@@ -371,7 +401,8 @@ static void lguest_load_tr_desc(void)
371 * Replacing the cpuid so we can turn features off is great for the kernel, but 401 * Replacing the cpuid so we can turn features off is great for the kernel, but
372 * anyone (including userspace) can just use the raw "cpuid" instruction and 402 * anyone (including userspace) can just use the raw "cpuid" instruction and
373 * the Host won't even notice since it isn't privileged. So we try not to get 403 * the Host won't even notice since it isn't privileged. So we try not to get
374 * too worked up about it. */ 404 * too worked up about it.
405 */
375static void lguest_cpuid(unsigned int *ax, unsigned int *bx, 406static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
376 unsigned int *cx, unsigned int *dx) 407 unsigned int *cx, unsigned int *dx)
377{ 408{
@@ -379,38 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
379 410
380 native_cpuid(ax, bx, cx, dx); 411 native_cpuid(ax, bx, cx, dx);
381 switch (function) { 412 switch (function) {
382 case 1: /* Basic feature request. */ 413 /*
383 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 414 * CPUID 0 gives the highest legal CPUID number (and the ID string).
415 * We futureproof our code a little by sticking to known CPUID values.
416 */
417 case 0:
418 if (*ax > 5)
419 *ax = 5;
420 break;
421
422 /*
423 * CPUID 1 is a basic feature request.
424 *
425 * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
426 * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
427 */
428 case 1:
384 *cx &= 0x00002201; 429 *cx &= 0x00002201;
385 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
386 *dx &= 0x07808151; 430 *dx &= 0x07808151;
387 /* The Host can do a nice optimization if it knows that the 431 /*
432 * The Host can do a nice optimization if it knows that the
388 * kernel mappings (addresses above 0xC0000000 or whatever 433 * kernel mappings (addresses above 0xC0000000 or whatever
389 * PAGE_OFFSET is set to) haven't changed. But Linux calls 434 * PAGE_OFFSET is set to) haven't changed. But Linux calls
390 * flush_tlb_user() for both user and kernel mappings unless 435 * flush_tlb_user() for both user and kernel mappings unless
391 * the Page Global Enable (PGE) feature bit is set. */ 436 * the Page Global Enable (PGE) feature bit is set.
437 */
392 *dx |= 0x00002000; 438 *dx |= 0x00002000;
393 /* We also lie, and say we're family id 5. 6 or greater 439 /*
440 * We also lie, and say we're family id 5. 6 or greater
394 * leads to a rdmsr in early_init_intel which we can't handle. 441 * leads to a rdmsr in early_init_intel which we can't handle.
395 * Family ID is returned as bits 8-12 in ax. */ 442 * Family ID is returned as bits 8-12 in ax.
443 */
396 *ax &= 0xFFFFF0FF; 444 *ax &= 0xFFFFF0FF;
397 *ax |= 0x00000500; 445 *ax |= 0x00000500;
398 break; 446 break;
447 /*
448 * 0x80000000 returns the highest Extended Function, so we futureproof
449 * like we do above by limiting it to known fields.
450 */
399 case 0x80000000: 451 case 0x80000000:
400 /* Futureproof this a little: if they ask how much extended
401 * processor information there is, limit it to known fields. */
402 if (*ax > 0x80000008) 452 if (*ax > 0x80000008)
403 *ax = 0x80000008; 453 *ax = 0x80000008;
404 break; 454 break;
455
456 /*
457 * PAE systems can mark pages as non-executable. Linux calls this the
458 * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
459 * Virus Protection). We just switch turn if off here, since we don't
460 * support it.
461 */
405 case 0x80000001: 462 case 0x80000001:
406 /* Here we should fix nx cap depending on host. */
407 /* For this version of PAE, we just clear NX bit. */
408 *dx &= ~(1 << 20); 463 *dx &= ~(1 << 20);
409 break; 464 break;
410 } 465 }
411} 466}
412 467
413/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. 468/*
469 * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
414 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother 470 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
415 * it. The Host needs to know when the Guest wants to change them, so we have 471 * it. The Host needs to know when the Guest wants to change them, so we have
416 * a whole series of functions like read_cr0() and write_cr0(). 472 * a whole series of functions like read_cr0() and write_cr0().
@@ -425,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
425 * name like "FPUTRAP bit" be a little less cryptic? 481 * name like "FPUTRAP bit" be a little less cryptic?
426 * 482 *
427 * We store cr0 locally because the Host never changes it. The Guest sometimes 483 * We store cr0 locally because the Host never changes it. The Guest sometimes
428 * wants to read it and we'd prefer not to bother the Host unnecessarily. */ 484 * wants to read it and we'd prefer not to bother the Host unnecessarily.
485 */
429static unsigned long current_cr0; 486static unsigned long current_cr0;
430static void lguest_write_cr0(unsigned long val) 487static void lguest_write_cr0(unsigned long val)
431{ 488{
@@ -438,18 +495,22 @@ static unsigned long lguest_read_cr0(void)
438 return current_cr0; 495 return current_cr0;
439} 496}
440 497
441/* Intel provided a special instruction to clear the TS bit for people too cool 498/*
499 * Intel provided a special instruction to clear the TS bit for people too cool
442 * to use write_cr0() to do it. This "clts" instruction is faster, because all 500 * to use write_cr0() to do it. This "clts" instruction is faster, because all
443 * the vowels have been optimized out. */ 501 * the vowels have been optimized out.
502 */
444static void lguest_clts(void) 503static void lguest_clts(void)
445{ 504{
446 lazy_hcall1(LHCALL_TS, 0); 505 lazy_hcall1(LHCALL_TS, 0);
447 current_cr0 &= ~X86_CR0_TS; 506 current_cr0 &= ~X86_CR0_TS;
448} 507}
449 508
450/* cr2 is the virtual address of the last page fault, which the Guest only ever 509/*
510 * cr2 is the virtual address of the last page fault, which the Guest only ever
451 * reads. The Host kindly writes this into our "struct lguest_data", so we 511 * reads. The Host kindly writes this into our "struct lguest_data", so we
452 * just read it out of there. */ 512 * just read it out of there.
513 */
453static unsigned long lguest_read_cr2(void) 514static unsigned long lguest_read_cr2(void)
454{ 515{
455 return lguest_data.cr2; 516 return lguest_data.cr2;
@@ -458,10 +519,12 @@ static unsigned long lguest_read_cr2(void)
458/* See lguest_set_pte() below. */ 519/* See lguest_set_pte() below. */
459static bool cr3_changed = false; 520static bool cr3_changed = false;
460 521
461/* cr3 is the current toplevel pagetable page: the principle is the same as 522/*
523 * cr3 is the current toplevel pagetable page: the principle is the same as
462 * cr0. Keep a local copy, and tell the Host when it changes. The only 524 * cr0. Keep a local copy, and tell the Host when it changes. The only
463 * difference is that our local copy is in lguest_data because the Host needs 525 * difference is that our local copy is in lguest_data because the Host needs
464 * to set it upon our initial hypercall. */ 526 * to set it upon our initial hypercall.
527 */
465static void lguest_write_cr3(unsigned long cr3) 528static void lguest_write_cr3(unsigned long cr3)
466{ 529{
467 lguest_data.pgdir = cr3; 530 lguest_data.pgdir = cr3;
@@ -506,7 +569,7 @@ static void lguest_write_cr4(unsigned long val)
506 * cr3 ---> +---------+ 569 * cr3 ---> +---------+
507 * | --------->+---------+ 570 * | --------->+---------+
508 * | | | PADDR1 | 571 * | | | PADDR1 |
509 * Top-level | | PADDR2 | 572 * Mid-level | | PADDR2 |
510 * (PMD) page | | | 573 * (PMD) page | | |
511 * | | Lower-level | 574 * | | Lower-level |
512 * | | (PTE) page | 575 * | | (PTE) page |
@@ -526,21 +589,62 @@ static void lguest_write_cr4(unsigned long val)
526 * Index into top Index into second Offset within page 589 * Index into top Index into second Offset within page
527 * page directory page pagetable page 590 * page directory page pagetable page
528 * 591 *
529 * The kernel spends a lot of time changing both the top-level page directory 592 * Now, unfortunately, this isn't the whole story: Intel added Physical Address
530 * and lower-level pagetable pages. The Guest doesn't know physical addresses, 593 * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
531 * so while it maintains these page tables exactly like normal, it also needs 594 * These are held in 64-bit page table entries, so we can now only fit 512
532 * to keep the Host informed whenever it makes a change: the Host will create 595 * entries in a page, and the neat three-level tree breaks down.
533 * the real page tables based on the Guests'. 596 *
597 * The result is a four level page table:
598 *
599 * cr3 --> [ 4 Upper ]
600 * [ Level ]
601 * [ Entries ]
602 * [(PUD Page)]---> +---------+
603 * | --------->+---------+
604 * | | | PADDR1 |
605 * Mid-level | | PADDR2 |
606 * (PMD) page | | |
607 * | | Lower-level |
608 * | | (PTE) page |
609 * | | | |
610 * .... ....
611 *
612 *
613 * And the virtual address is decoded as:
614 *
615 * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
616 * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
617 * Index into Index into mid Index into lower Offset within page
618 * top entries directory page pagetable page
619 *
620 * It's too hard to switch between these two formats at runtime, so Linux only
621 * supports one or the other depending on whether CONFIG_X86_PAE is set. Many
622 * distributions turn it on, and not just for people with silly amounts of
623 * memory: the larger PTE entries allow room for the NX bit, which lets the
624 * kernel disable execution of pages and increase security.
625 *
626 * This was a problem for lguest, which couldn't run on these distributions;
627 * then Matias Zabaljauregui figured it all out and implemented it, and only a
628 * handful of puppies were crushed in the process!
629 *
630 * Back to our point: the kernel spends a lot of time changing both the
631 * top-level page directory and lower-level pagetable pages. The Guest doesn't
632 * know physical addresses, so while it maintains these page tables exactly
633 * like normal, it also needs to keep the Host informed whenever it makes a
634 * change: the Host will create the real page tables based on the Guests'.
534 */ 635 */
535 636
536/* The Guest calls this to set a second-level entry (pte), ie. to map a page 637/*
537 * into a process' address space. We set the entry then tell the Host the 638 * The Guest calls this after it has set a second-level entry (pte), ie. to map
538 * toplevel and address this corresponds to. The Guest uses one pagetable per 639 * a page into a process' address space. Wetell the Host the toplevel and
539 * process, so we need to tell the Host which one we're changing (mm->pgd). */ 640 * address this corresponds to. The Guest uses one pagetable per process, so
641 * we need to tell the Host which one we're changing (mm->pgd).
642 */
540static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 643static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
541 pte_t *ptep) 644 pte_t *ptep)
542{ 645{
543#ifdef CONFIG_X86_PAE 646#ifdef CONFIG_X86_PAE
647 /* PAE needs to hand a 64 bit page table entry, so it uses two args. */
544 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, 648 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
545 ptep->pte_low, ptep->pte_high); 649 ptep->pte_low, ptep->pte_high);
546#else 650#else
@@ -548,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
548#endif 652#endif
549} 653}
550 654
655/* This is the "set and update" combo-meal-deal version. */
551static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 656static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
552 pte_t *ptep, pte_t pteval) 657 pte_t *ptep, pte_t pteval)
553{ 658{
@@ -555,10 +660,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
555 lguest_pte_update(mm, addr, ptep); 660 lguest_pte_update(mm, addr, ptep);
556} 661}
557 662
558/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd 663/*
664 * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
559 * to set a middle-level entry when PAE is activated. 665 * to set a middle-level entry when PAE is activated.
666 *
560 * Again, we set the entry then tell the Host which page we changed, 667 * Again, we set the entry then tell the Host which page we changed,
561 * and the index of the entry we changed. */ 668 * and the index of the entry we changed.
669 */
562#ifdef CONFIG_X86_PAE 670#ifdef CONFIG_X86_PAE
563static void lguest_set_pud(pud_t *pudp, pud_t pudval) 671static void lguest_set_pud(pud_t *pudp, pud_t pudval)
564{ 672{
@@ -577,8 +685,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
577} 685}
578#else 686#else
579 687
580/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not 688/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
581 * activated. */
582static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 689static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
583{ 690{
584 native_set_pmd(pmdp, pmdval); 691 native_set_pmd(pmdp, pmdval);
@@ -587,7 +694,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
587} 694}
588#endif 695#endif
589 696
590/* There are a couple of legacy places where the kernel sets a PTE, but we 697/*
698 * There are a couple of legacy places where the kernel sets a PTE, but we
591 * don't know the top level any more. This is useless for us, since we don't 699 * don't know the top level any more. This is useless for us, since we don't
592 * know which pagetable is changing or what address, so we just tell the Host 700 * know which pagetable is changing or what address, so we just tell the Host
593 * to forget all of them. Fortunately, this is very rare. 701 * to forget all of them. Fortunately, this is very rare.
@@ -595,7 +703,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
595 * ... except in early boot when the kernel sets up the initial pagetables, 703 * ... except in early boot when the kernel sets up the initial pagetables,
596 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell 704 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell
597 * the Host anything changed until we've done the first page table switch, 705 * the Host anything changed until we've done the first page table switch,
598 * which brings boot back to 0.25 seconds. */ 706 * which brings boot back to 0.25 seconds.
707 */
599static void lguest_set_pte(pte_t *ptep, pte_t pteval) 708static void lguest_set_pte(pte_t *ptep, pte_t pteval)
600{ 709{
601 native_set_pte(ptep, pteval); 710 native_set_pte(ptep, pteval);
@@ -604,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
604} 713}
605 714
606#ifdef CONFIG_X86_PAE 715#ifdef CONFIG_X86_PAE
716/*
717 * With 64-bit PTE values, we need to be careful setting them: if we set 32
718 * bits at a time, the hardware could see a weird half-set entry. These
719 * versions ensure we update all 64 bits at once.
720 */
607static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) 721static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
608{ 722{
609 native_set_pte_atomic(ptep, pte); 723 native_set_pte_atomic(ptep, pte);
@@ -611,19 +725,21 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
611 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 725 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
612} 726}
613 727
614void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 728static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
729 pte_t *ptep)
615{ 730{
616 native_pte_clear(mm, addr, ptep); 731 native_pte_clear(mm, addr, ptep);
617 lguest_pte_update(mm, addr, ptep); 732 lguest_pte_update(mm, addr, ptep);
618} 733}
619 734
620void lguest_pmd_clear(pmd_t *pmdp) 735static void lguest_pmd_clear(pmd_t *pmdp)
621{ 736{
622 lguest_set_pmd(pmdp, __pmd(0)); 737 lguest_set_pmd(pmdp, __pmd(0));
623} 738}
624#endif 739#endif
625 740
626/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 741/*
742 * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
627 * native page table operations. On native hardware you can set a new page 743 * native page table operations. On native hardware you can set a new page
628 * table entry whenever you want, but if you want to remove one you have to do 744 * table entry whenever you want, but if you want to remove one you have to do
629 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). 745 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@ -632,24 +748,29 @@ void lguest_pmd_clear(pmd_t *pmdp)
632 * called when a valid entry is written, not when it's removed (ie. marked not 748 * called when a valid entry is written, not when it's removed (ie. marked not
633 * present). Instead, this is where we come when the Guest wants to remove a 749 * present). Instead, this is where we come when the Guest wants to remove a
634 * page table entry: we tell the Host to set that entry to 0 (ie. the present 750 * page table entry: we tell the Host to set that entry to 0 (ie. the present
635 * bit is zero). */ 751 * bit is zero).
752 */
636static void lguest_flush_tlb_single(unsigned long addr) 753static void lguest_flush_tlb_single(unsigned long addr)
637{ 754{
638 /* Simply set it to zero: if it was not, it will fault back in. */ 755 /* Simply set it to zero: if it was not, it will fault back in. */
639 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); 756 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
640} 757}
641 758
642/* This is what happens after the Guest has removed a large number of entries. 759/*
760 * This is what happens after the Guest has removed a large number of entries.
643 * This tells the Host that any of the page table entries for userspace might 761 * This tells the Host that any of the page table entries for userspace might
644 * have changed, ie. virtual addresses below PAGE_OFFSET. */ 762 * have changed, ie. virtual addresses below PAGE_OFFSET.
763 */
645static void lguest_flush_tlb_user(void) 764static void lguest_flush_tlb_user(void)
646{ 765{
647 lazy_hcall1(LHCALL_FLUSH_TLB, 0); 766 lazy_hcall1(LHCALL_FLUSH_TLB, 0);
648} 767}
649 768
650/* This is called when the kernel page tables have changed. That's not very 769/*
770 * This is called when the kernel page tables have changed. That's not very
651 * common (unless the Guest is using highmem, which makes the Guest extremely 771 * common (unless the Guest is using highmem, which makes the Guest extremely
652 * slow), so it's worth separating this from the user flushing above. */ 772 * slow), so it's worth separating this from the user flushing above.
773 */
653static void lguest_flush_tlb_kernel(void) 774static void lguest_flush_tlb_kernel(void)
654{ 775{
655 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 776 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
@@ -686,26 +807,38 @@ static struct irq_chip lguest_irq_controller = {
686 .unmask = enable_lguest_irq, 807 .unmask = enable_lguest_irq,
687}; 808};
688 809
689/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware 810/*
811 * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
690 * interrupt (except 128, which is used for system calls), and then tells the 812 * interrupt (except 128, which is used for system calls), and then tells the
691 * Linux infrastructure that each interrupt is controlled by our level-based 813 * Linux infrastructure that each interrupt is controlled by our level-based
692 * lguest interrupt controller. */ 814 * lguest interrupt controller.
815 */
693static void __init lguest_init_IRQ(void) 816static void __init lguest_init_IRQ(void)
694{ 817{
695 unsigned int i; 818 unsigned int i;
696 819
697 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 820 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
698 /* Some systems map "vectors" to interrupts weirdly. Lguest has 821 /* Some systems map "vectors" to interrupts weirdly. Not us! */
699 * a straightforward 1 to 1 mapping, so force that here. */
700 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 822 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
701 if (i != SYSCALL_VECTOR) 823 if (i != SYSCALL_VECTOR)
702 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 824 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
703 } 825 }
704 /* This call is required to set up for 4k stacks, where we have 826
705 * separate stacks for hard and soft interrupts. */ 827 /*
828 * This call is required to set up for 4k stacks, where we have
829 * separate stacks for hard and soft interrupts.
830 */
706 irq_ctx_init(smp_processor_id()); 831 irq_ctx_init(smp_processor_id());
707} 832}
708 833
834/*
835 * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
836 * rather than set them in lguest_init_IRQ we are called here every time an
837 * lguest device needs an interrupt.
838 *
839 * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
840 * pass that up!
841 */
709void lguest_setup_irq(unsigned int irq) 842void lguest_setup_irq(unsigned int irq)
710{ 843{
711 irq_to_desc_alloc_node(irq, 0); 844 irq_to_desc_alloc_node(irq, 0);
@@ -724,31 +857,39 @@ static unsigned long lguest_get_wallclock(void)
724 return lguest_data.time.tv_sec; 857 return lguest_data.time.tv_sec;
725} 858}
726 859
727/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 860/*
861 * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
728 * what speed it runs at, or 0 if it's unusable as a reliable clock source. 862 * what speed it runs at, or 0 if it's unusable as a reliable clock source.
729 * This matches what we want here: if we return 0 from this function, the x86 863 * This matches what we want here: if we return 0 from this function, the x86
730 * TSC clock will give up and not register itself. */ 864 * TSC clock will give up and not register itself.
865 */
731static unsigned long lguest_tsc_khz(void) 866static unsigned long lguest_tsc_khz(void)
732{ 867{
733 return lguest_data.tsc_khz; 868 return lguest_data.tsc_khz;
734} 869}
735 870
736/* If we can't use the TSC, the kernel falls back to our lower-priority 871/*
737 * "lguest_clock", where we read the time value given to us by the Host. */ 872 * If we can't use the TSC, the kernel falls back to our lower-priority
873 * "lguest_clock", where we read the time value given to us by the Host.
874 */
738static cycle_t lguest_clock_read(struct clocksource *cs) 875static cycle_t lguest_clock_read(struct clocksource *cs)
739{ 876{
740 unsigned long sec, nsec; 877 unsigned long sec, nsec;
741 878
742 /* Since the time is in two parts (seconds and nanoseconds), we risk 879 /*
880 * Since the time is in two parts (seconds and nanoseconds), we risk
743 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, 881 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
744 * and getting 99 and 0. As Linux tends to come apart under the stress 882 * and getting 99 and 0. As Linux tends to come apart under the stress
745 * of time travel, we must be careful: */ 883 * of time travel, we must be careful:
884 */
746 do { 885 do {
747 /* First we read the seconds part. */ 886 /* First we read the seconds part. */
748 sec = lguest_data.time.tv_sec; 887 sec = lguest_data.time.tv_sec;
749 /* This read memory barrier tells the compiler and the CPU that 888 /*
889 * This read memory barrier tells the compiler and the CPU that
750 * this can't be reordered: we have to complete the above 890 * this can't be reordered: we have to complete the above
751 * before going on. */ 891 * before going on.
892 */
752 rmb(); 893 rmb();
753 /* Now we read the nanoseconds part. */ 894 /* Now we read the nanoseconds part. */
754 nsec = lguest_data.time.tv_nsec; 895 nsec = lguest_data.time.tv_nsec;
@@ -772,9 +913,11 @@ static struct clocksource lguest_clock = {
772 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 913 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
773}; 914};
774 915
775/* We also need a "struct clock_event_device": Linux asks us to set it to go 916/*
917 * We also need a "struct clock_event_device": Linux asks us to set it to go
776 * off some time in the future. Actually, James Morris figured all this out, I 918 * off some time in the future. Actually, James Morris figured all this out, I
777 * just applied the patch. */ 919 * just applied the patch.
920 */
778static int lguest_clockevent_set_next_event(unsigned long delta, 921static int lguest_clockevent_set_next_event(unsigned long delta,
779 struct clock_event_device *evt) 922 struct clock_event_device *evt)
780{ 923{
@@ -824,8 +967,10 @@ static struct clock_event_device lguest_clockevent = {
824 .max_delta_ns = LG_CLOCK_MAX_DELTA, 967 .max_delta_ns = LG_CLOCK_MAX_DELTA,
825}; 968};
826 969
827/* This is the Guest timer interrupt handler (hardware interrupt 0). We just 970/*
828 * call the clockevent infrastructure and it does whatever needs doing. */ 971 * This is the Guest timer interrupt handler (hardware interrupt 0). We just
972 * call the clockevent infrastructure and it does whatever needs doing.
973 */
829static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) 974static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
830{ 975{
831 unsigned long flags; 976 unsigned long flags;
@@ -836,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
836 local_irq_restore(flags); 981 local_irq_restore(flags);
837} 982}
838 983
839/* At some point in the boot process, we get asked to set up our timing 984/*
985 * At some point in the boot process, we get asked to set up our timing
840 * infrastructure. The kernel doesn't expect timer interrupts before this, but 986 * infrastructure. The kernel doesn't expect timer interrupts before this, but
841 * we cleverly initialized the "blocked_interrupts" field of "struct 987 * we cleverly initialized the "blocked_interrupts" field of "struct
842 * lguest_data" so that timer interrupts were blocked until now. */ 988 * lguest_data" so that timer interrupts were blocked until now.
989 */
843static void lguest_time_init(void) 990static void lguest_time_init(void)
844{ 991{
845 /* Set up the timer interrupt (0) to go to our simple timer routine */ 992 /* Set up the timer interrupt (0) to go to our simple timer routine */
@@ -863,14 +1010,16 @@ static void lguest_time_init(void)
863 * to work. They're pretty simple. 1010 * to work. They're pretty simple.
864 */ 1011 */
865 1012
866/* The Guest needs to tell the Host what stack it expects traps to use. For 1013/*
1014 * The Guest needs to tell the Host what stack it expects traps to use. For
867 * native hardware, this is part of the Task State Segment mentioned above in 1015 * native hardware, this is part of the Task State Segment mentioned above in
868 * lguest_load_tr_desc(), but to help hypervisors there's this special call. 1016 * lguest_load_tr_desc(), but to help hypervisors there's this special call.
869 * 1017 *
870 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data 1018 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
871 * segment), the privilege level (we're privilege level 1, the Host is 0 and 1019 * segment), the privilege level (we're privilege level 1, the Host is 0 and
872 * will not tolerate us trying to use that), the stack pointer, and the number 1020 * will not tolerate us trying to use that), the stack pointer, and the number
873 * of pages in the stack. */ 1021 * of pages in the stack.
1022 */
874static void lguest_load_sp0(struct tss_struct *tss, 1023static void lguest_load_sp0(struct tss_struct *tss,
875 struct thread_struct *thread) 1024 struct thread_struct *thread)
876{ 1025{
@@ -884,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value)
884 /* FIXME: Implement */ 1033 /* FIXME: Implement */
885} 1034}
886 1035
887/* There are times when the kernel wants to make sure that no memory writes are 1036/*
1037 * There are times when the kernel wants to make sure that no memory writes are
888 * caught in the cache (that they've all reached real hardware devices). This 1038 * caught in the cache (that they've all reached real hardware devices). This
889 * doesn't matter for the Guest which has virtual hardware. 1039 * doesn't matter for the Guest which has virtual hardware.
890 * 1040 *
@@ -898,11 +1048,13 @@ static void lguest_wbinvd(void)
898{ 1048{
899} 1049}
900 1050
901/* If the Guest expects to have an Advanced Programmable Interrupt Controller, 1051/*
1052 * If the Guest expects to have an Advanced Programmable Interrupt Controller,
902 * we play dumb by ignoring writes and returning 0 for reads. So it's no 1053 * we play dumb by ignoring writes and returning 0 for reads. So it's no
903 * longer Programmable nor Controlling anything, and I don't think 8 lines of 1054 * longer Programmable nor Controlling anything, and I don't think 8 lines of
904 * code qualifies for Advanced. It will also never interrupt anything. It 1055 * code qualifies for Advanced. It will also never interrupt anything. It
905 * does, however, allow us to get through the Linux boot code. */ 1056 * does, however, allow us to get through the Linux boot code.
1057 */
906#ifdef CONFIG_X86_LOCAL_APIC 1058#ifdef CONFIG_X86_LOCAL_APIC
907static void lguest_apic_write(u32 reg, u32 v) 1059static void lguest_apic_write(u32 reg, u32 v)
908{ 1060{
@@ -951,11 +1103,13 @@ static void lguest_safe_halt(void)
951 kvm_hypercall0(LHCALL_HALT); 1103 kvm_hypercall0(LHCALL_HALT);
952} 1104}
953 1105
954/* The SHUTDOWN hypercall takes a string to describe what's happening, and 1106/*
1107 * The SHUTDOWN hypercall takes a string to describe what's happening, and
955 * an argument which says whether this to restart (reboot) the Guest or not. 1108 * an argument which says whether this to restart (reboot) the Guest or not.
956 * 1109 *
957 * Note that the Host always prefers that the Guest speak in physical addresses 1110 * Note that the Host always prefers that the Guest speak in physical addresses
958 * rather than virtual addresses, so we use __pa() here. */ 1111 * rather than virtual addresses, so we use __pa() here.
1112 */
959static void lguest_power_off(void) 1113static void lguest_power_off(void)
960{ 1114{
961 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), 1115 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
@@ -986,8 +1140,10 @@ static __init char *lguest_memory_setup(void)
986 * nice to move it back to lguest_init. Patch welcome... */ 1140 * nice to move it back to lguest_init. Patch welcome... */
987 atomic_notifier_chain_register(&panic_notifier_list, &paniced); 1141 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
988 1142
989 /* The Linux bootloader header contains an "e820" memory map: the 1143 /*
990 * Launcher populated the first entry with our memory limit. */ 1144 *The Linux bootloader header contains an "e820" memory map: the
1145 * Launcher populated the first entry with our memory limit.
1146 */
991 e820_add_region(boot_params.e820_map[0].addr, 1147 e820_add_region(boot_params.e820_map[0].addr,
992 boot_params.e820_map[0].size, 1148 boot_params.e820_map[0].size,
993 boot_params.e820_map[0].type); 1149 boot_params.e820_map[0].type);
@@ -996,16 +1152,17 @@ static __init char *lguest_memory_setup(void)
996 return "LGUEST"; 1152 return "LGUEST";
997} 1153}
998 1154
999/* We will eventually use the virtio console device to produce console output, 1155/*
1156 * We will eventually use the virtio console device to produce console output,
1000 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce 1157 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
1001 * console output. */ 1158 * console output.
1159 */
1002static __init int early_put_chars(u32 vtermno, const char *buf, int count) 1160static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1003{ 1161{
1004 char scratch[17]; 1162 char scratch[17];
1005 unsigned int len = count; 1163 unsigned int len = count;
1006 1164
1007 /* We use a nul-terminated string, so we have to make a copy. Icky, 1165 /* We use a nul-terminated string, so we make a copy. Icky, huh? */
1008 * huh? */
1009 if (len > sizeof(scratch) - 1) 1166 if (len > sizeof(scratch) - 1)
1010 len = sizeof(scratch) - 1; 1167 len = sizeof(scratch) - 1;
1011 scratch[len] = '\0'; 1168 scratch[len] = '\0';
@@ -1016,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1016 return len; 1173 return len;
1017} 1174}
1018 1175
1019/* Rebooting also tells the Host we're finished, but the RESTART flag tells the 1176/*
1020 * Launcher to reboot us. */ 1177 * Rebooting also tells the Host we're finished, but the RESTART flag tells the
1178 * Launcher to reboot us.
1179 */
1021static void lguest_restart(char *reason) 1180static void lguest_restart(char *reason)
1022{ 1181{
1023 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); 1182 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
@@ -1044,7 +1203,8 @@ static void lguest_restart(char *reason)
1044 * fit comfortably. 1203 * fit comfortably.
1045 * 1204 *
1046 * First we need assembly templates of each of the patchable Guest operations, 1205 * First we need assembly templates of each of the patchable Guest operations,
1047 * and these are in i386_head.S. */ 1206 * and these are in i386_head.S.
1207 */
1048 1208
1049/*G:060 We construct a table from the assembler templates: */ 1209/*G:060 We construct a table from the assembler templates: */
1050static const struct lguest_insns 1210static const struct lguest_insns
@@ -1055,9 +1215,11 @@ static const struct lguest_insns
1055 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1215 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
1056}; 1216};
1057 1217
1058/* Now our patch routine is fairly simple (based on the native one in 1218/*
1219 * Now our patch routine is fairly simple (based on the native one in
1059 * paravirt.c). If we have a replacement, we copy it in and return how much of 1220 * paravirt.c). If we have a replacement, we copy it in and return how much of
1060 * the available space we used. */ 1221 * the available space we used.
1222 */
1061static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, 1223static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1062 unsigned long addr, unsigned len) 1224 unsigned long addr, unsigned len)
1063{ 1225{
@@ -1069,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1069 1231
1070 insn_len = lguest_insns[type].end - lguest_insns[type].start; 1232 insn_len = lguest_insns[type].end - lguest_insns[type].start;
1071 1233
1072 /* Similarly if we can't fit replacement (shouldn't happen, but let's 1234 /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
1073 * be thorough). */
1074 if (len < insn_len) 1235 if (len < insn_len)
1075 return paravirt_patch_default(type, clobber, ibuf, addr, len); 1236 return paravirt_patch_default(type, clobber, ibuf, addr, len);
1076 1237
@@ -1079,22 +1240,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1079 return insn_len; 1240 return insn_len;
1080} 1241}
1081 1242
1082/*G:030 Once we get to lguest_init(), we know we're a Guest. The various 1243/*G:029
1244 * Once we get to lguest_init(), we know we're a Guest. The various
1083 * pv_ops structures in the kernel provide points for (almost) every routine we 1245 * pv_ops structures in the kernel provide points for (almost) every routine we
1084 * have to override to avoid privileged instructions. */ 1246 * have to override to avoid privileged instructions.
1247 */
1085__init void lguest_init(void) 1248__init void lguest_init(void)
1086{ 1249{
1087 /* We're under lguest, paravirt is enabled, and we're running at 1250 /* We're under lguest. */
1088 * privilege level 1, not 0 as normal. */
1089 pv_info.name = "lguest"; 1251 pv_info.name = "lguest";
1252 /* Paravirt is enabled. */
1090 pv_info.paravirt_enabled = 1; 1253 pv_info.paravirt_enabled = 1;
1254 /* We're running at privilege level 1, not 0 as normal. */
1091 pv_info.kernel_rpl = 1; 1255 pv_info.kernel_rpl = 1;
1256 /* Everyone except Xen runs with this set. */
1092 pv_info.shared_kernel_pmd = 1; 1257 pv_info.shared_kernel_pmd = 1;
1093 1258
1094 /* We set up all the lguest overrides for sensitive operations. These 1259 /*
1095 * are detailed with the operations themselves. */ 1260 * We set up all the lguest overrides for sensitive operations. These
1261 * are detailed with the operations themselves.
1262 */
1096 1263
1097 /* interrupt-related operations */ 1264 /* Interrupt-related operations */
1098 pv_irq_ops.init_IRQ = lguest_init_IRQ; 1265 pv_irq_ops.init_IRQ = lguest_init_IRQ;
1099 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1266 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
1100 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); 1267 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
@@ -1102,11 +1269,11 @@ __init void lguest_init(void)
1102 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); 1269 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
1103 pv_irq_ops.safe_halt = lguest_safe_halt; 1270 pv_irq_ops.safe_halt = lguest_safe_halt;
1104 1271
1105 /* init-time operations */ 1272 /* Setup operations */
1106 pv_init_ops.memory_setup = lguest_memory_setup; 1273 pv_init_ops.memory_setup = lguest_memory_setup;
1107 pv_init_ops.patch = lguest_patch; 1274 pv_init_ops.patch = lguest_patch;
1108 1275
1109 /* Intercepts of various cpu instructions */ 1276 /* Intercepts of various CPU instructions */
1110 pv_cpu_ops.load_gdt = lguest_load_gdt; 1277 pv_cpu_ops.load_gdt = lguest_load_gdt;
1111 pv_cpu_ops.cpuid = lguest_cpuid; 1278 pv_cpu_ops.cpuid = lguest_cpuid;
1112 pv_cpu_ops.load_idt = lguest_load_idt; 1279 pv_cpu_ops.load_idt = lguest_load_idt;
@@ -1127,7 +1294,7 @@ __init void lguest_init(void)
1127 pv_cpu_ops.start_context_switch = paravirt_start_context_switch; 1294 pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
1128 pv_cpu_ops.end_context_switch = lguest_end_context_switch; 1295 pv_cpu_ops.end_context_switch = lguest_end_context_switch;
1129 1296
1130 /* pagetable management */ 1297 /* Pagetable management */
1131 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1298 pv_mmu_ops.write_cr3 = lguest_write_cr3;
1132 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; 1299 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
1133 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; 1300 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
@@ -1149,54 +1316,71 @@ __init void lguest_init(void)
1149 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1316 pv_mmu_ops.pte_update_defer = lguest_pte_update;
1150 1317
1151#ifdef CONFIG_X86_LOCAL_APIC 1318#ifdef CONFIG_X86_LOCAL_APIC
1152 /* apic read/write intercepts */ 1319 /* APIC read/write intercepts */
1153 set_lguest_basic_apic_ops(); 1320 set_lguest_basic_apic_ops();
1154#endif 1321#endif
1155 1322
1156 /* time operations */ 1323 /* Time operations */
1157 pv_time_ops.get_wallclock = lguest_get_wallclock; 1324 pv_time_ops.get_wallclock = lguest_get_wallclock;
1158 pv_time_ops.time_init = lguest_time_init; 1325 pv_time_ops.time_init = lguest_time_init;
1159 pv_time_ops.get_tsc_khz = lguest_tsc_khz; 1326 pv_time_ops.get_tsc_khz = lguest_tsc_khz;
1160 1327
1161 /* Now is a good time to look at the implementations of these functions 1328 /*
1162 * before returning to the rest of lguest_init(). */ 1329 * Now is a good time to look at the implementations of these functions
1330 * before returning to the rest of lguest_init().
1331 */
1163 1332
1164 /*G:070 Now we've seen all the paravirt_ops, we return to 1333 /*G:070
1334 * Now we've seen all the paravirt_ops, we return to
1165 * lguest_init() where the rest of the fairly chaotic boot setup 1335 * lguest_init() where the rest of the fairly chaotic boot setup
1166 * occurs. */ 1336 * occurs.
1337 */
1167 1338
1168 /* The stack protector is a weird thing where gcc places a canary 1339 /*
1340 * The stack protector is a weird thing where gcc places a canary
1169 * value on the stack and then checks it on return. This file is 1341 * value on the stack and then checks it on return. This file is
1170 * compiled with -fno-stack-protector it, so we got this far without 1342 * compiled with -fno-stack-protector it, so we got this far without
1171 * problems. The value of the canary is kept at offset 20 from the 1343 * problems. The value of the canary is kept at offset 20 from the
1172 * %gs register, so we need to set that up before calling C functions 1344 * %gs register, so we need to set that up before calling C functions
1173 * in other files. */ 1345 * in other files.
1346 */
1174 setup_stack_canary_segment(0); 1347 setup_stack_canary_segment(0);
1175 /* We could just call load_stack_canary_segment(), but we might as 1348
1176 * call switch_to_new_gdt() which loads the whole table and sets up 1349 /*
1177 * the per-cpu segment descriptor register %fs as well. */ 1350 * We could just call load_stack_canary_segment(), but we might as well
1351 * call switch_to_new_gdt() which loads the whole table and sets up the
1352 * per-cpu segment descriptor register %fs as well.
1353 */
1178 switch_to_new_gdt(0); 1354 switch_to_new_gdt(0);
1179 1355
1180 /* As described in head_32.S, we map the first 128M of memory. */ 1356 /* We actually boot with all memory mapped, but let's say 128MB. */
1181 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1357 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1182 1358
1183 /* The Host<->Guest Switcher lives at the top of our address space, and 1359 /*
1360 * The Host<->Guest Switcher lives at the top of our address space, and
1184 * the Host told us how big it is when we made LGUEST_INIT hypercall: 1361 * the Host told us how big it is when we made LGUEST_INIT hypercall:
1185 * it put the answer in lguest_data.reserve_mem */ 1362 * it put the answer in lguest_data.reserve_mem
1363 */
1186 reserve_top_address(lguest_data.reserve_mem); 1364 reserve_top_address(lguest_data.reserve_mem);
1187 1365
1188 /* If we don't initialize the lock dependency checker now, it crashes 1366 /*
1189 * paravirt_disable_iospace. */ 1367 * If we don't initialize the lock dependency checker now, it crashes
1368 * paravirt_disable_iospace.
1369 */
1190 lockdep_init(); 1370 lockdep_init();
1191 1371
1192 /* The IDE code spends about 3 seconds probing for disks: if we reserve 1372 /*
1373 * The IDE code spends about 3 seconds probing for disks: if we reserve
1193 * all the I/O ports up front it can't get them and so doesn't probe. 1374 * all the I/O ports up front it can't get them and so doesn't probe.
1194 * Other device drivers are similar (but less severe). This cuts the 1375 * Other device drivers are similar (but less severe). This cuts the
1195 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ 1376 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds.
1377 */
1196 paravirt_disable_iospace(); 1378 paravirt_disable_iospace();
1197 1379
1198 /* This is messy CPU setup stuff which the native boot code does before 1380 /*
1199 * start_kernel, so we have to do, too: */ 1381 * This is messy CPU setup stuff which the native boot code does before
1382 * start_kernel, so we have to do, too:
1383 */
1200 cpu_detect(&new_cpu_data); 1384 cpu_detect(&new_cpu_data);
1201 /* head.S usually sets up the first capability word, so do it here. */ 1385 /* head.S usually sets up the first capability word, so do it here. */
1202 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1386 new_cpu_data.x86_capability[0] = cpuid_edx(1);
@@ -1213,22 +1397,28 @@ __init void lguest_init(void)
1213 acpi_ht = 0; 1397 acpi_ht = 0;
1214#endif 1398#endif
1215 1399
1216 /* We set the preferred console to "hvc". This is the "hypervisor 1400 /*
1401 * We set the preferred console to "hvc". This is the "hypervisor
1217 * virtual console" driver written by the PowerPC people, which we also 1402 * virtual console" driver written by the PowerPC people, which we also
1218 * adapted for lguest's use. */ 1403 * adapted for lguest's use.
1404 */
1219 add_preferred_console("hvc", 0, NULL); 1405 add_preferred_console("hvc", 0, NULL);
1220 1406
1221 /* Register our very early console. */ 1407 /* Register our very early console. */
1222 virtio_cons_early_init(early_put_chars); 1408 virtio_cons_early_init(early_put_chars);
1223 1409
1224 /* Last of all, we set the power management poweroff hook to point to 1410 /*
1411 * Last of all, we set the power management poweroff hook to point to
1225 * the Guest routine to power off, and the reboot hook to our restart 1412 * the Guest routine to power off, and the reboot hook to our restart
1226 * routine. */ 1413 * routine.
1414 */
1227 pm_power_off = lguest_power_off; 1415 pm_power_off = lguest_power_off;
1228 machine_ops.restart = lguest_restart; 1416 machine_ops.restart = lguest_restart;
1229 1417
1230 /* Now we're set up, call i386_start_kernel() in head32.c and we proceed 1418 /*
1231 * to boot as normal. It never returns. */ 1419 * Now we're set up, call i386_start_kernel() in head32.c and we proceed
1420 * to boot as normal. It never returns.
1421 */
1232 i386_start_kernel(); 1422 i386_start_kernel();
1233} 1423}
1234/* 1424/*
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index a9c8cfe61cd4..27eac0faee48 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -5,7 +5,8 @@
5#include <asm/thread_info.h> 5#include <asm/thread_info.h>
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7 7
8/*G:020 Our story starts with the kernel booting into startup_32 in 8/*G:020
9 * Our story starts with the kernel booting into startup_32 in
9 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by 10 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
10 * the bootloader (the Launcher in our case). 11 * the bootloader (the Launcher in our case).
11 * 12 *
@@ -21,11 +22,14 @@
21 * data without remembering to subtract __PAGE_OFFSET! 22 * data without remembering to subtract __PAGE_OFFSET!
22 * 23 *
23 * The .section line puts this code in .init.text so it will be discarded after 24 * The .section line puts this code in .init.text so it will be discarded after
24 * boot. */ 25 * boot.
26 */
25.section .init.text, "ax", @progbits 27.section .init.text, "ax", @progbits
26ENTRY(lguest_entry) 28ENTRY(lguest_entry)
27 /* We make the "initialization" hypercall now to tell the Host about 29 /*
28 * us, and also find out where it put our page tables. */ 30 * We make the "initialization" hypercall now to tell the Host about
31 * us, and also find out where it put our page tables.
32 */
29 movl $LHCALL_LGUEST_INIT, %eax 33 movl $LHCALL_LGUEST_INIT, %eax
30 movl $lguest_data - __PAGE_OFFSET, %ebx 34 movl $lguest_data - __PAGE_OFFSET, %ebx
31 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 35 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
@@ -33,13 +37,14 @@ ENTRY(lguest_entry)
33 /* Set up the initial stack so we can run C code. */ 37 /* Set up the initial stack so we can run C code. */
34 movl $(init_thread_union+THREAD_SIZE),%esp 38 movl $(init_thread_union+THREAD_SIZE),%esp
35 39
36 /* Jumps are relative, and we're running __PAGE_OFFSET too low at the 40 /* Jumps are relative: we're running __PAGE_OFFSET too low. */
37 * moment. */
38 jmp lguest_init+__PAGE_OFFSET 41 jmp lguest_init+__PAGE_OFFSET
39 42
40/*G:055 We create a macro which puts the assembler code between lgstart_ and 43/*G:055
41 * lgend_ markers. These templates are put in the .text section: they can't be 44 * We create a macro which puts the assembler code between lgstart_ and lgend_
42 * discarded after boot as we may need to patch modules, too. */ 45 * markers. These templates are put in the .text section: they can't be
46 * discarded after boot as we may need to patch modules, too.
47 */
43.text 48.text
44#define LGUEST_PATCH(name, insns...) \ 49#define LGUEST_PATCH(name, insns...) \
45 lgstart_##name: insns; lgend_##name:; \ 50 lgstart_##name: insns; lgend_##name:; \
@@ -48,83 +53,103 @@ ENTRY(lguest_entry)
48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 53LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
49LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 54LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
50 55
51/*G:033 But using those wrappers is inefficient (we'll see why that doesn't 56/*G:033
52 * matter for save_fl and irq_disable later). If we write our routines 57 * But using those wrappers is inefficient (we'll see why that doesn't matter
53 * carefully in assembler, we can avoid clobbering any registers and avoid 58 * for save_fl and irq_disable later). If we write our routines carefully in
54 * jumping through the wrapper functions. 59 * assembler, we can avoid clobbering any registers and avoid jumping through
60 * the wrapper functions.
55 * 61 *
56 * I skipped over our first piece of assembler, but this one is worth studying 62 * I skipped over our first piece of assembler, but this one is worth studying
57 * in a bit more detail so I'll describe in easy stages. First, the routine 63 * in a bit more detail so I'll describe in easy stages. First, the routine to
58 * to enable interrupts: */ 64 * enable interrupts:
65 */
59ENTRY(lg_irq_enable) 66ENTRY(lg_irq_enable)
60 /* The reverse of irq_disable, this sets lguest_data.irq_enabled to 67 /*
61 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ 68 * The reverse of irq_disable, this sets lguest_data.irq_enabled to
69 * X86_EFLAGS_IF (ie. "Interrupts enabled").
70 */
62 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled 71 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
63 /* But now we need to check if the Host wants to know: there might have 72 /*
73 * But now we need to check if the Host wants to know: there might have
64 * been interrupts waiting to be delivered, in which case it will have 74 * been interrupts waiting to be delivered, in which case it will have
65 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we 75 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
66 * jump to send_interrupts, otherwise we're done. */ 76 * jump to send_interrupts, otherwise we're done.
77 */
67 testl $0, lguest_data+LGUEST_DATA_irq_pending 78 testl $0, lguest_data+LGUEST_DATA_irq_pending
68 jnz send_interrupts 79 jnz send_interrupts
69 /* One cool thing about x86 is that you can do many things without using 80 /*
81 * One cool thing about x86 is that you can do many things without using
70 * a register. In this case, the normal path hasn't needed to save or 82 * a register. In this case, the normal path hasn't needed to save or
71 * restore any registers at all! */ 83 * restore any registers at all!
84 */
72 ret 85 ret
73send_interrupts: 86send_interrupts:
74 /* OK, now we need a register: eax is used for the hypercall number, 87 /*
88 * OK, now we need a register: eax is used for the hypercall number,
75 * which is LHCALL_SEND_INTERRUPTS. 89 * which is LHCALL_SEND_INTERRUPTS.
76 * 90 *
77 * We used not to bother with this pending detection at all, which was 91 * We used not to bother with this pending detection at all, which was
78 * much simpler. Sooner or later the Host would realize it had to 92 * much simpler. Sooner or later the Host would realize it had to
79 * send us an interrupt. But that turns out to make performance 7 93 * send us an interrupt. But that turns out to make performance 7
80 * times worse on a simple tcp benchmark. So now we do this the hard 94 * times worse on a simple tcp benchmark. So now we do this the hard
81 * way. */ 95 * way.
96 */
82 pushl %eax 97 pushl %eax
83 movl $LHCALL_SEND_INTERRUPTS, %eax 98 movl $LHCALL_SEND_INTERRUPTS, %eax
84 /* This is a vmcall instruction (same thing that KVM uses). Older 99 /*
100 * This is a vmcall instruction (same thing that KVM uses). Older
85 * assembler versions might not know the "vmcall" instruction, so we 101 * assembler versions might not know the "vmcall" instruction, so we
86 * create one manually here. */ 102 * create one manually here.
103 */
87 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 104 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
105 /* Put eax back the way we found it. */
88 popl %eax 106 popl %eax
89 ret 107 ret
90 108
91/* Finally, the "popf" or "restore flags" routine. The %eax register holds the 109/*
110 * Finally, the "popf" or "restore flags" routine. The %eax register holds the
92 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're 111 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
93 * enabling interrupts again, if it's 0 we're leaving them off. */ 112 * enabling interrupts again, if it's 0 we're leaving them off.
113 */
94ENTRY(lg_restore_fl) 114ENTRY(lg_restore_fl)
95 /* This is just "lguest_data.irq_enabled = flags;" */ 115 /* This is just "lguest_data.irq_enabled = flags;" */
96 movl %eax, lguest_data+LGUEST_DATA_irq_enabled 116 movl %eax, lguest_data+LGUEST_DATA_irq_enabled
97 /* Now, if the %eax value has enabled interrupts and 117 /*
118 * Now, if the %eax value has enabled interrupts and
98 * lguest_data.irq_pending is set, we want to tell the Host so it can 119 * lguest_data.irq_pending is set, we want to tell the Host so it can
99 * deliver any outstanding interrupts. Fortunately, both values will 120 * deliver any outstanding interrupts. Fortunately, both values will
100 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" 121 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
101 * instruction will AND them together for us. If both are set, we 122 * instruction will AND them together for us. If both are set, we
102 * jump to send_interrupts. */ 123 * jump to send_interrupts.
124 */
103 testl lguest_data+LGUEST_DATA_irq_pending, %eax 125 testl lguest_data+LGUEST_DATA_irq_pending, %eax
104 jnz send_interrupts 126 jnz send_interrupts
105 /* Again, the normal path has used no extra registers. Clever, huh? */ 127 /* Again, the normal path has used no extra registers. Clever, huh? */
106 ret 128 ret
129/*:*/
107 130
108/* These demark the EIP range where host should never deliver interrupts. */ 131/* These demark the EIP range where host should never deliver interrupts. */
109.global lguest_noirq_start 132.global lguest_noirq_start
110.global lguest_noirq_end 133.global lguest_noirq_end
111 134
112/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, 135/*M:004
113 * it sets the eflags interrupt bit on the stack based on 136 * When the Host reflects a trap or injects an interrupt into the Guest, it
114 * lguest_data.irq_enabled, so the Guest iret logic does the right thing when 137 * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
115 * restoring it. However, when the Host sets the Guest up for direct traps, 138 * so the Guest iret logic does the right thing when restoring it. However,
116 * such as system calls, the processor is the one to push eflags onto the 139 * when the Host sets the Guest up for direct traps, such as system calls, the
117 * stack, and the interrupt bit will be 1 (in reality, interrupts are always 140 * processor is the one to push eflags onto the stack, and the interrupt bit
118 * enabled in the Guest). 141 * will be 1 (in reality, interrupts are always enabled in the Guest).
119 * 142 *
120 * This turns out to be harmless: the only trap which should happen under Linux 143 * This turns out to be harmless: the only trap which should happen under Linux
121 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc 144 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
122 * regions), which has to be reflected through the Host anyway. If another 145 * regions), which has to be reflected through the Host anyway. If another
123 * trap *does* go off when interrupts are disabled, the Guest will panic, and 146 * trap *does* go off when interrupts are disabled, the Guest will panic, and
124 * we'll never get to this iret! :*/ 147 * we'll never get to this iret!
148:*/
125 149
126/*G:045 There is one final paravirt_op that the Guest implements, and glancing 150/*G:045
127 * at it you can see why I left it to last. It's *cool*! It's in *assembler*! 151 * There is one final paravirt_op that the Guest implements, and glancing at it
152 * you can see why I left it to last. It's *cool*! It's in *assembler*!
128 * 153 *
129 * The "iret" instruction is used to return from an interrupt or trap. The 154 * The "iret" instruction is used to return from an interrupt or trap. The
130 * stack looks like this: 155 * stack looks like this:
@@ -148,15 +173,18 @@ ENTRY(lg_restore_fl)
148 * return to userspace or wherever. Our solution to this is to surround the 173 * return to userspace or wherever. Our solution to this is to surround the
149 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the 174 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
150 * Host that it is *never* to interrupt us there, even if interrupts seem to be 175 * Host that it is *never* to interrupt us there, even if interrupts seem to be
151 * enabled. */ 176 * enabled.
177 */
152ENTRY(lguest_iret) 178ENTRY(lguest_iret)
153 pushl %eax 179 pushl %eax
154 movl 12(%esp), %eax 180 movl 12(%esp), %eax
155lguest_noirq_start: 181lguest_noirq_start:
156 /* Note the %ss: segment prefix here. Normal data accesses use the 182 /*
183 * Note the %ss: segment prefix here. Normal data accesses use the
157 * "ds" segment, but that will have already been restored for whatever 184 * "ds" segment, but that will have already been restored for whatever
158 * we're returning to (such as userspace): we can't trust it. The %ss: 185 * we're returning to (such as userspace): we can't trust it. The %ss:
159 * prefix makes sure we use the stack segment, which is still valid. */ 186 * prefix makes sure we use the stack segment, which is still valid.
187 */
160 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled 188 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
161 popl %eax 189 popl %eax
162 iret 190 iret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f9d35632666b..07c31899c9c2 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -10,6 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o
10lib-y += memcpy_$(BITS).o 10lib-y += memcpy_$(BITS).o
11 11
12ifeq ($(CONFIG_X86_32),y) 12ifeq ($(CONFIG_X86_32),y)
13 obj-y += atomic64_32.o
13 lib-y += checksum_32.o 14 lib-y += checksum_32.o
14 lib-y += strstr_32.o 15 lib-y += strstr_32.o
15 lib-y += semaphore_32.o string_32.o 16 lib-y += semaphore_32.o string_32.o
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
new file mode 100644
index 000000000000..824fa0be55a3
--- /dev/null
+++ b/arch/x86/lib/atomic64_32.c
@@ -0,0 +1,230 @@
1#include <linux/compiler.h>
2#include <linux/module.h>
3#include <linux/types.h>
4
5#include <asm/processor.h>
6#include <asm/cmpxchg.h>
7#include <asm/atomic.h>
8
9static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new)
10{
11 u32 low = new;
12 u32 high = new >> 32;
13
14 asm volatile(
15 LOCK_PREFIX "cmpxchg8b %1\n"
16 : "+A" (old), "+m" (*ptr)
17 : "b" (low), "c" (high)
18 );
19 return old;
20}
21
22u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val)
23{
24 return cmpxchg8b(&ptr->counter, old_val, new_val);
25}
26EXPORT_SYMBOL(atomic64_cmpxchg);
27
28/**
29 * atomic64_xchg - xchg atomic64 variable
30 * @ptr: pointer to type atomic64_t
31 * @new_val: value to assign
32 *
33 * Atomically xchgs the value of @ptr to @new_val and returns
34 * the old value.
35 */
36u64 atomic64_xchg(atomic64_t *ptr, u64 new_val)
37{
38 /*
39 * Try first with a (possibly incorrect) assumption about
40 * what we have there. We'll do two loops most likely,
41 * but we'll get an ownership MESI transaction straight away
42 * instead of a read transaction followed by a
43 * flush-for-ownership transaction:
44 */
45 u64 old_val, real_val = 0;
46
47 do {
48 old_val = real_val;
49
50 real_val = atomic64_cmpxchg(ptr, old_val, new_val);
51
52 } while (real_val != old_val);
53
54 return old_val;
55}
56EXPORT_SYMBOL(atomic64_xchg);
57
58/**
59 * atomic64_set - set atomic64 variable
60 * @ptr: pointer to type atomic64_t
61 * @new_val: value to assign
62 *
63 * Atomically sets the value of @ptr to @new_val.
64 */
65void atomic64_set(atomic64_t *ptr, u64 new_val)
66{
67 atomic64_xchg(ptr, new_val);
68}
69EXPORT_SYMBOL(atomic64_set);
70
71/**
72EXPORT_SYMBOL(atomic64_read);
73 * atomic64_add_return - add and return
74 * @delta: integer value to add
75 * @ptr: pointer to type atomic64_t
76 *
77 * Atomically adds @delta to @ptr and returns @delta + *@ptr
78 */
79noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
80{
81 /*
82 * Try first with a (possibly incorrect) assumption about
83 * what we have there. We'll do two loops most likely,
84 * but we'll get an ownership MESI transaction straight away
85 * instead of a read transaction followed by a
86 * flush-for-ownership transaction:
87 */
88 u64 old_val, new_val, real_val = 0;
89
90 do {
91 old_val = real_val;
92 new_val = old_val + delta;
93
94 real_val = atomic64_cmpxchg(ptr, old_val, new_val);
95
96 } while (real_val != old_val);
97
98 return new_val;
99}
100EXPORT_SYMBOL(atomic64_add_return);
101
102u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
103{
104 return atomic64_add_return(-delta, ptr);
105}
106EXPORT_SYMBOL(atomic64_sub_return);
107
108u64 atomic64_inc_return(atomic64_t *ptr)
109{
110 return atomic64_add_return(1, ptr);
111}
112EXPORT_SYMBOL(atomic64_inc_return);
113
114u64 atomic64_dec_return(atomic64_t *ptr)
115{
116 return atomic64_sub_return(1, ptr);
117}
118EXPORT_SYMBOL(atomic64_dec_return);
119
120/**
121 * atomic64_add - add integer to atomic64 variable
122 * @delta: integer value to add
123 * @ptr: pointer to type atomic64_t
124 *
125 * Atomically adds @delta to @ptr.
126 */
127void atomic64_add(u64 delta, atomic64_t *ptr)
128{
129 atomic64_add_return(delta, ptr);
130}
131EXPORT_SYMBOL(atomic64_add);
132
133/**
134 * atomic64_sub - subtract the atomic64 variable
135 * @delta: integer value to subtract
136 * @ptr: pointer to type atomic64_t
137 *
138 * Atomically subtracts @delta from @ptr.
139 */
140void atomic64_sub(u64 delta, atomic64_t *ptr)
141{
142 atomic64_add(-delta, ptr);
143}
144EXPORT_SYMBOL(atomic64_sub);
145
146/**
147 * atomic64_sub_and_test - subtract value from variable and test result
148 * @delta: integer value to subtract
149 * @ptr: pointer to type atomic64_t
150 *
151 * Atomically subtracts @delta from @ptr and returns
152 * true if the result is zero, or false for all
153 * other cases.
154 */
155int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
156{
157 u64 new_val = atomic64_sub_return(delta, ptr);
158
159 return new_val == 0;
160}
161EXPORT_SYMBOL(atomic64_sub_and_test);
162
163/**
164 * atomic64_inc - increment atomic64 variable
165 * @ptr: pointer to type atomic64_t
166 *
167 * Atomically increments @ptr by 1.
168 */
169void atomic64_inc(atomic64_t *ptr)
170{
171 atomic64_add(1, ptr);
172}
173EXPORT_SYMBOL(atomic64_inc);
174
175/**
176 * atomic64_dec - decrement atomic64 variable
177 * @ptr: pointer to type atomic64_t
178 *
179 * Atomically decrements @ptr by 1.
180 */
181void atomic64_dec(atomic64_t *ptr)
182{
183 atomic64_sub(1, ptr);
184}
185EXPORT_SYMBOL(atomic64_dec);
186
187/**
188 * atomic64_dec_and_test - decrement and test
189 * @ptr: pointer to type atomic64_t
190 *
191 * Atomically decrements @ptr by 1 and
192 * returns true if the result is 0, or false for all other
193 * cases.
194 */
195int atomic64_dec_and_test(atomic64_t *ptr)
196{
197 return atomic64_sub_and_test(1, ptr);
198}
199EXPORT_SYMBOL(atomic64_dec_and_test);
200
201/**
202 * atomic64_inc_and_test - increment and test
203 * @ptr: pointer to type atomic64_t
204 *
205 * Atomically increments @ptr by 1
206 * and returns true if the result is zero, or false for all
207 * other cases.
208 */
209int atomic64_inc_and_test(atomic64_t *ptr)
210{
211 return atomic64_sub_and_test(-1, ptr);
212}
213EXPORT_SYMBOL(atomic64_inc_and_test);
214
215/**
216 * atomic64_add_negative - add and test if negative
217 * @delta: integer value to add
218 * @ptr: pointer to type atomic64_t
219 *
220 * Atomically adds @delta to @ptr and returns true
221 * if the result is negative, or false when
222 * result is greater than or equal to zero.
223 */
224int atomic64_add_negative(u64 delta, atomic64_t *ptr)
225{
226 s64 new_val = atomic64_add_return(delta, ptr);
227
228 return new_val < 0;
229}
230EXPORT_SYMBOL(atomic64_add_negative);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 9a10a78bb4a4..ebeafcce04a9 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -5,15 +5,14 @@
5 * Zero a page. 5 * Zero a page.
6 * rdi page 6 * rdi page
7 */ 7 */
8 ALIGN 8ENTRY(clear_page_c)
9clear_page_c:
10 CFI_STARTPROC 9 CFI_STARTPROC
11 movl $4096/8,%ecx 10 movl $4096/8,%ecx
12 xorl %eax,%eax 11 xorl %eax,%eax
13 rep stosq 12 rep stosq
14 ret 13 ret
15 CFI_ENDPROC 14 CFI_ENDPROC
16ENDPROC(clear_page) 15ENDPROC(clear_page_c)
17 16
18ENTRY(clear_page) 17ENTRY(clear_page)
19 CFI_STARTPROC 18 CFI_STARTPROC
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index f118c110af32..6ba0f7bb85ea 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -75,6 +75,7 @@ ENTRY(copy_to_user)
75 jae bad_to_user 75 jae bad_to_user
76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
77 CFI_ENDPROC 77 CFI_ENDPROC
78ENDPROC(copy_to_user)
78 79
79/* Standard copy_from_user with segment limit checking */ 80/* Standard copy_from_user with segment limit checking */
80ENTRY(copy_from_user) 81ENTRY(copy_from_user)
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 1440b9c0547e..caa24aca8115 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -89,16 +89,13 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
89 rv.msrs = msrs; 89 rv.msrs = msrs;
90 rv.msr_no = msr_no; 90 rv.msr_no = msr_no;
91 91
92 preempt_disable(); 92 this_cpu = get_cpu();
93 /* 93
94 * FIXME: handle the CPU we're executing on separately for now until 94 if (cpumask_test_cpu(this_cpu, mask))
95 * smp_call_function_many has been fixed to not skip it. 95 __rdmsr_on_cpu(&rv);
96 */
97 this_cpu = raw_smp_processor_id();
98 smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
99 96
100 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); 97 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
101 preempt_enable(); 98 put_cpu();
102} 99}
103EXPORT_SYMBOL(rdmsr_on_cpus); 100EXPORT_SYMBOL(rdmsr_on_cpus);
104 101
@@ -121,16 +118,13 @@ void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
121 rv.msrs = msrs; 118 rv.msrs = msrs;
122 rv.msr_no = msr_no; 119 rv.msr_no = msr_no;
123 120
124 preempt_disable(); 121 this_cpu = get_cpu();
125 /* 122
126 * FIXME: handle the CPU we're executing on separately for now until 123 if (cpumask_test_cpu(this_cpu, mask))
127 * smp_call_function_many has been fixed to not skip it. 124 __wrmsr_on_cpu(&rv);
128 */
129 this_cpu = raw_smp_processor_id();
130 smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
131 125
132 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); 126 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
133 preempt_enable(); 127 put_cpu();
134} 128}
135EXPORT_SYMBOL(wrmsr_on_cpus); 129EXPORT_SYMBOL(wrmsr_on_cpus);
136 130
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7c8ca91bb9ec..1f118d462acc 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -751,7 +751,7 @@ survive:
751 751
752 if (retval == -ENOMEM && is_global_init(current)) { 752 if (retval == -ENOMEM && is_global_init(current)) {
753 up_read(&current->mm->mmap_sem); 753 up_read(&current->mm->mmap_sem);
754 congestion_wait(WRITE, HZ/50); 754 congestion_wait(BLK_RW_ASYNC, HZ/50);
755 goto survive; 755 goto survive;
756 } 756 }
757 757
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 78a5fff857be..bfae139182ff 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -426,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address)
426} 426}
427 427
428static const char errata93_warning[] = 428static const char errata93_warning[] =
429KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" 429KERN_ERR
430KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" 430"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
431KERN_ERR "******* Please consider a BIOS update.\n" 431"******* Working around it, but it may cause SEGVs or burn power.\n"
432KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; 432"******* Please consider a BIOS update.\n"
433"******* Disabling USB legacy in the BIOS may also help.\n";
433 434
434/* 435/*
435 * No vm86 mode in 64-bit mode: 436 * No vm86 mode in 64-bit mode:
@@ -696,7 +697,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
696 if (!printk_ratelimit()) 697 if (!printk_ratelimit())
697 return; 698 return;
698 699
699 printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", 700 printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
700 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 701 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
701 tsk->comm, task_pid_nr(tsk), address, 702 tsk->comm, task_pid_nr(tsk), address,
702 (void *)regs->ip, (void *)regs->sp, error_code); 703 (void *)regs->ip, (void *)regs->sp, error_code);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 58f621e81919..2112ed55e7ea 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmap);
103EXPORT_SYMBOL(kunmap); 103EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic); 104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic); 105EXPORT_SYMBOL(kunmap_atomic);
106EXPORT_SYMBOL(kmap_atomic_prot);
106 107
107void __init set_highmem_pages_init(void) 108void __init set_highmem_pages_init(void)
108{ 109{
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 47ce9a2ce5e7..0607119cef94 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -12,6 +12,7 @@
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
14#include <asm/tlb.h> 14#include <asm/tlb.h>
15#include <asm/proto.h>
15 16
16DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 17DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
17 18
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b177652251a4..6176fe8f29e0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -598,8 +598,15 @@ void __init paging_init(void)
598 598
599 sparse_memory_present_with_active_regions(MAX_NUMNODES); 599 sparse_memory_present_with_active_regions(MAX_NUMNODES);
600 sparse_init(); 600 sparse_init();
601 /* clear the default setting with node 0 */ 601
602 nodes_clear(node_states[N_NORMAL_MEMORY]); 602 /*
603 * clear the default setting with node 0
604 * note: don't use nodes_clear here, that is really clearing when
605 * numa support is not compiled in, and later node_set_state
606 * will not set it back.
607 */
608 node_clear_state(0, N_NORMAL_MEMORY);
609
603 free_area_init_nodes(max_zone_pfns); 610 free_area_init_nodes(max_zone_pfns);
604} 611}
605 612
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index c106f7852424..dce282f65700 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -592,9 +592,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
592 unsigned int level; 592 unsigned int level;
593 pte_t *kpte, old_pte; 593 pte_t *kpte, old_pte;
594 594
595 if (cpa->flags & CPA_PAGES_ARRAY) 595 if (cpa->flags & CPA_PAGES_ARRAY) {
596 address = (unsigned long)page_address(cpa->pages[cpa->curpage]); 596 struct page *page = cpa->pages[cpa->curpage];
597 else if (cpa->flags & CPA_ARRAY) 597 if (unlikely(PageHighMem(page)))
598 return 0;
599 address = (unsigned long)page_address(page);
600 } else if (cpa->flags & CPA_ARRAY)
598 address = cpa->vaddr[cpa->curpage]; 601 address = cpa->vaddr[cpa->curpage];
599 else 602 else
600 address = *cpa->vaddr; 603 address = *cpa->vaddr;
@@ -698,9 +701,12 @@ static int cpa_process_alias(struct cpa_data *cpa)
698 * No need to redo, when the primary call touched the direct 701 * No need to redo, when the primary call touched the direct
699 * mapping already: 702 * mapping already:
700 */ 703 */
701 if (cpa->flags & CPA_PAGES_ARRAY) 704 if (cpa->flags & CPA_PAGES_ARRAY) {
702 vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); 705 struct page *page = cpa->pages[cpa->curpage];
703 else if (cpa->flags & CPA_ARRAY) 706 if (unlikely(PageHighMem(page)))
707 return 0;
708 vaddr = (unsigned long)page_address(page);
709 } else if (cpa->flags & CPA_ARRAY)
704 vaddr = cpa->vaddr[cpa->curpage]; 710 vaddr = cpa->vaddr[cpa->curpage];
705 else 711 else
706 vaddr = *cpa->vaddr; 712 vaddr = *cpa->vaddr;
@@ -998,12 +1004,15 @@ EXPORT_SYMBOL(set_memory_array_uc);
998int _set_memory_wc(unsigned long addr, int numpages) 1004int _set_memory_wc(unsigned long addr, int numpages)
999{ 1005{
1000 int ret; 1006 int ret;
1007 unsigned long addr_copy = addr;
1008
1001 ret = change_page_attr_set(&addr, numpages, 1009 ret = change_page_attr_set(&addr, numpages,
1002 __pgprot(_PAGE_CACHE_UC_MINUS), 0); 1010 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
1003
1004 if (!ret) { 1011 if (!ret) {
1005 ret = change_page_attr_set(&addr, numpages, 1012 ret = change_page_attr_set_clr(&addr_copy, numpages,
1006 __pgprot(_PAGE_CACHE_WC), 0); 1013 __pgprot(_PAGE_CACHE_WC),
1014 __pgprot(_PAGE_CACHE_MASK),
1015 0, 0, NULL);
1007 } 1016 }
1008 return ret; 1017 return ret;
1009} 1018}
@@ -1120,7 +1129,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
1120 int free_idx; 1129 int free_idx;
1121 1130
1122 for (i = 0; i < addrinarray; i++) { 1131 for (i = 0; i < addrinarray; i++) {
1123 start = (unsigned long)page_address(pages[i]); 1132 if (PageHighMem(pages[i]))
1133 continue;
1134 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1124 end = start + PAGE_SIZE; 1135 end = start + PAGE_SIZE;
1125 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) 1136 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
1126 goto err_out; 1137 goto err_out;
@@ -1133,7 +1144,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
1133err_out: 1144err_out:
1134 free_idx = i; 1145 free_idx = i;
1135 for (i = 0; i < free_idx; i++) { 1146 for (i = 0; i < free_idx; i++) {
1136 start = (unsigned long)page_address(pages[i]); 1147 if (PageHighMem(pages[i]))
1148 continue;
1149 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1137 end = start + PAGE_SIZE; 1150 end = start + PAGE_SIZE;
1138 free_memtype(start, end); 1151 free_memtype(start, end);
1139 } 1152 }
@@ -1162,7 +1175,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
1162 return retval; 1175 return retval;
1163 1176
1164 for (i = 0; i < addrinarray; i++) { 1177 for (i = 0; i < addrinarray; i++) {
1165 start = (unsigned long)page_address(pages[i]); 1178 if (PageHighMem(pages[i]))
1179 continue;
1180 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1166 end = start + PAGE_SIZE; 1181 end = start + PAGE_SIZE;
1167 free_memtype(start, end); 1182 free_memtype(start, end);
1168 } 1183 }
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8e43bdd45456..ed34f5e35999 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -25,7 +25,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
25 return pte; 25 return pte;
26} 26}
27 27
28void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 28void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
29{ 29{
30 pgtable_page_dtor(pte); 30 pgtable_page_dtor(pte);
31 paravirt_release_pte(page_to_pfn(pte)); 31 paravirt_release_pte(page_to_pfn(pte));
@@ -33,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
33} 33}
34 34
35#if PAGETABLE_LEVELS > 2 35#if PAGETABLE_LEVELS > 2
36void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) 36void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
37{ 37{
38 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); 38 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
39 tlb_remove_page(tlb, virt_to_page(pmd)); 39 tlb_remove_page(tlb, virt_to_page(pmd));
40} 40}
41 41
42#if PAGETABLE_LEVELS > 3 42#if PAGETABLE_LEVELS > 3
43void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 43void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
44{ 44{
45 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 45 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
46 tlb_remove_page(tlb, virt_to_page(pud)); 46 tlb_remove_page(tlb, virt_to_page(pud));
@@ -329,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve)
329 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 329 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
330 (int)-reserve); 330 (int)-reserve);
331 __FIXADDR_TOP = -reserve - PAGE_SIZE; 331 __FIXADDR_TOP = -reserve - PAGE_SIZE;
332 __VMALLOC_RESERVE += reserve;
333#endif 332#endif
334} 333}
335 334
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 2dfcbf9df2ae..dbb5381f7b3b 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -79,8 +79,10 @@ static __init void bad_srat(void)
79 acpi_numa = -1; 79 acpi_numa = -1;
80 for (i = 0; i < MAX_LOCAL_APIC; i++) 80 for (i = 0; i < MAX_LOCAL_APIC; i++)
81 apicid_to_node[i] = NUMA_NO_NODE; 81 apicid_to_node[i] = NUMA_NO_NODE;
82 for (i = 0; i < MAX_NUMNODES; i++) 82 for (i = 0; i < MAX_NUMNODES; i++) {
83 nodes_add[i].start = nodes[i].end = 0; 83 nodes[i].start = nodes[i].end = 0;
84 nodes_add[i].start = nodes_add[i].end = 0;
85 }
84 remove_all_active_ranges(); 86 remove_all_active_ranges();
85} 87}
86 88
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index b07dd8d0b321..89b9a5cd63da 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -390,7 +390,7 @@ static int __init p4_init(char **cpu_type)
390static int force_arch_perfmon; 390static int force_arch_perfmon;
391static int force_cpu_type(const char *str, struct kernel_param *kp) 391static int force_cpu_type(const char *str, struct kernel_param *kp)
392{ 392{
393 if (!strcmp(str, "archperfmon")) { 393 if (!strcmp(str, "arch_perfmon")) {
394 force_arch_perfmon = 1; 394 force_arch_perfmon = 1;
395 printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); 395 printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
396 } 396 }
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index b26626dc517c..1014eb4bfc37 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -68,6 +68,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
68 unsigned long flags; 68 unsigned long flags;
69 struct resource *root; 69 struct resource *root;
70 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; 70 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES;
71 u64 start, end;
72
73 if (bus_has_transparent_bridge(info->bus))
74 max_root_bus_resources -= 3;
71 75
72 status = resource_to_addr(acpi_res, &addr); 76 status = resource_to_addr(acpi_res, &addr);
73 if (!ACPI_SUCCESS(status)) 77 if (!ACPI_SUCCESS(status))
@@ -84,25 +88,24 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
84 } else 88 } else
85 return AE_OK; 89 return AE_OK;
86 90
87 res = &info->res[info->res_num]; 91 start = addr.minimum + addr.translation_offset;
88 res->name = info->name; 92 end = start + addr.address_length - 1;
89 res->flags = flags;
90 res->start = addr.minimum + addr.translation_offset;
91 res->end = res->start + addr.address_length - 1;
92 res->child = NULL;
93
94 if (bus_has_transparent_bridge(info->bus))
95 max_root_bus_resources -= 3;
96 if (info->res_num >= max_root_bus_resources) { 93 if (info->res_num >= max_root_bus_resources) {
97 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " 94 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx "
98 "from %s for %s due to _CRS returning more than " 95 "from %s for %s due to _CRS returning more than "
99 "%d resource descriptors\n", (unsigned long) res->start, 96 "%d resource descriptors\n", (unsigned long) start,
100 (unsigned long) res->end, root->name, info->name, 97 (unsigned long) end, root->name, info->name,
101 max_root_bus_resources); 98 max_root_bus_resources);
102 info->res_num++;
103 return AE_OK; 99 return AE_OK;
104 } 100 }
105 101
102 res = &info->res[info->res_num];
103 res->name = info->name;
104 res->flags = flags;
105 res->start = start;
106 res->end = end;
107 res->child = NULL;
108
106 if (insert_resource(root, res)) { 109 if (insert_resource(root, res)) {
107 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " 110 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx "
108 "from %s for %s\n", (unsigned long) res->start, 111 "from %s for %s\n", (unsigned long) res->start,
@@ -115,23 +118,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
115} 118}
116 119
117static void 120static void
118adjust_transparent_bridge_resources(struct pci_bus *bus)
119{
120 struct pci_dev *dev;
121
122 list_for_each_entry(dev, &bus->devices, bus_list) {
123 int i;
124 u16 class = dev->class >> 8;
125
126 if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) {
127 for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++)
128 dev->subordinate->resource[i] =
129 dev->bus->resource[i - 3];
130 }
131 }
132}
133
134static void
135get_current_resources(struct acpi_device *device, int busnum, 121get_current_resources(struct acpi_device *device, int busnum,
136 int domain, struct pci_bus *bus) 122 int domain, struct pci_bus *bus)
137{ 123{
@@ -158,8 +144,6 @@ get_current_resources(struct acpi_device *device, int busnum,
158 info.res_num = 0; 144 info.res_num = 0;
159 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, 145 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
160 &info); 146 &info);
161 if (info.res_num)
162 adjust_transparent_bridge_resources(bus);
163 147
164 return; 148 return;
165 149
@@ -222,8 +206,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
222 */ 206 */
223 memcpy(bus->sysdata, sd, sizeof(*sd)); 207 memcpy(bus->sysdata, sd, sizeof(*sd));
224 kfree(sd); 208 kfree(sd);
225 } else 209 } else {
226 bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); 210 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
211 if (bus) {
212 if (pci_probe & PCI_USE__CRS)
213 get_current_resources(device, busnum, domain,
214 bus);
215 bus->subordinate = pci_scan_child_bus(bus);
216 }
217 }
227 218
228 if (!bus) 219 if (!bus)
229 kfree(sd); 220 kfree(sd);
@@ -238,8 +229,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
238#endif 229#endif
239 } 230 }
240 231
241 if (bus && (pci_probe & PCI_USE__CRS))
242 get_current_resources(device, busnum, domain, bus);
243 return bus; 232 return bus;
244} 233}
245 234
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index f893d6a6e803..3ffa10df20b9 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -100,8 +100,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
100 int j; 100 int j;
101 struct pci_root_info *info; 101 struct pci_root_info *info;
102 102
103 /* don't go for it if _CRS is used */ 103 /* don't go for it if _CRS is used already */
104 if (pci_probe & PCI_USE__CRS) 104 if (b->resource[0] != &ioport_resource ||
105 b->resource[1] != &iomem_resource)
105 return; 106 return;
106 107
107 /* if only one root bus, don't need to anything */ 108 /* if only one root bus, don't need to anything */
@@ -116,6 +117,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
116 if (i == pci_root_num) 117 if (i == pci_root_num)
117 return; 118 return;
118 119
120 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
121 b->number);
122
119 info = &pci_root_info[i]; 123 info = &pci_root_info[i];
120 for (j = 0; j < info->res_num; j++) { 124 for (j = 0; j < info->res_num; j++) {
121 struct resource *res; 125 struct resource *res;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 0fb56db16d18..52e62e57fedd 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -35,6 +35,7 @@
35#include <asm/pat.h> 35#include <asm/pat.h>
36#include <asm/e820.h> 36#include <asm/e820.h>
37#include <asm/pci_x86.h> 37#include <asm/pci_x86.h>
38#include <asm/io_apic.h>
38 39
39 40
40static int 41static int
@@ -227,6 +228,12 @@ void __init pcibios_resource_survey(void)
227 pcibios_allocate_resources(1); 228 pcibios_allocate_resources(1);
228 229
229 e820_reserve_resources_late(); 230 e820_reserve_resources_late();
231 /*
232 * Insert the IO APIC resources after PCI initialization has
233 * occured to handle IO APICS that are mapped in on a BAR in
234 * PCI space, but before trying to assign unassigned pci res.
235 */
236 ioapic_insert_resources();
230} 237}
231 238
232/** 239/**
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index de2abbd07544..a6a198c33623 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,7 +1,7 @@
1# __restore_processor_state() restores %gs after S3 resume and so should not 1# __restore_processor_state() restores %gs after S3 resume and so should not
2# itself be stack-protected 2# itself be stack-protected
3nostackp := $(call cc-option, -fno-stack-protector) 3nostackp := $(call cc-option, -fno-stack-protector)
4CFLAGS_cpu_$(BITS).o := $(nostackp) 4CFLAGS_cpu.o := $(nostackp)
5 5
6obj-$(CONFIG_PM_SLEEP) += cpu.o 6obj-$(CONFIG_PM_SLEEP) += cpu.o
7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o 7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o