aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorDavid Woodhouse <David.Woodhouse@intel.com>2009-08-08 06:25:28 -0400
committerDavid Woodhouse <David.Woodhouse@intel.com>2009-08-08 06:26:15 -0400
commita131bc185528331451a93db6c50a7d2070376a61 (patch)
tree18cccd206d4835ee8df147ac3b0c0e30cc00680d /arch/x86
parent19943b0e30b05d42e494ae6fef78156ebc8c637e (diff)
parentff1649ff780fb7c0bfbf42d05ffc9b56336b9aa3 (diff)
Merge branch 'master' of /pub/scm/linux/kernel/git/torvalds/linux-2.6
Pull fixes in from 2.6.31 so that people testing the iommu-2.6.git tree no longer trip over bugs which were already fixed (sorry, Horms).
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/boot/video-bios.c3
-rw-r--r--arch/x86/boot/video-vesa.c4
-rw-r--r--arch/x86/include/asm/atomic_32.h185
-rw-r--r--arch/x86/include/asm/atomic_64.h42
-rw-r--r--arch/x86/include/asm/efi.h5
-rw-r--r--arch/x86/include/asm/fixmap.h10
-rw-r--r--arch/x86/include/asm/io_apic.h2
-rw-r--r--arch/x86/include/asm/irqflags.h8
-rw-r--r--arch/x86/include/asm/lguest.h3
-rw-r--r--arch/x86/include/asm/lguest_hcall.h18
-rw-r--r--arch/x86/include/asm/msr-index.h4
-rw-r--r--arch/x86/include/asm/nmi.h1
-rw-r--r--arch/x86/include/asm/pgalloc.h25
-rw-r--r--arch/x86/include/asm/spinlock.h4
-rw-r--r--arch/x86/include/asm/stacktrace.h2
-rw-r--r--arch/x86/include/asm/thread_info.h2
-rw-r--r--arch/x86/include/asm/uaccess.h4
-rw-r--r--arch/x86/include/asm/uaccess_64.h10
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h9
-rw-r--r--arch/x86/kernel/amd_iommu.c4
-rw-r--r--arch/x86/kernel/amd_iommu_init.c13
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/es7000_32.c3
-rw-r--r--arch/x86/kernel/apic/io_apic.c47
-rw-r--r--arch/x86/kernel/apic/numaq_32.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c38
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c32
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c18
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c261
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c5
-rw-r--r--arch/x86/kernel/dumpstack_32.c6
-rw-r--r--arch/x86/kernel/dumpstack_64.c22
-rw-r--r--arch/x86/kernel/e820.c7
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/efi_64.c6
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/mfgpt_32.c2
-rw-r--r--arch/x86/kernel/pci-gart_64.c2
-rw-r--r--arch/x86/kernel/pvclock.c2
-rw-r--r--arch/x86/kernel/reboot.c42
-rw-r--r--arch/x86/kernel/setup.c13
-rw-r--r--arch/x86/kernel/vmlinux.lds.S23
-rw-r--r--arch/x86/lguest/boot.c510
-rw-r--r--arch/x86/lguest/i386_head.S112
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_32.c230
-rw-r--r--arch/x86/lib/clear_page_64.S5
-rw-r--r--arch/x86/lib/copy_user_64.S1
-rw-r--r--arch/x86/lib/msr.c26
-rw-r--r--arch/x86/lib/usercopy_32.c2
-rw-r--r--arch/x86/mm/fault.c11
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--arch/x86/mm/init.c1
-rw-r--r--arch/x86/mm/init_64.c11
-rw-r--r--arch/x86/mm/pageattr.c39
-rw-r--r--arch/x86/mm/pgtable.c7
-rw-r--r--arch/x86/mm/srat_64.c6
-rw-r--r--arch/x86/oprofile/nmi_int.c2
-rw-r--r--arch/x86/pci/acpi.c59
-rw-r--r--arch/x86/pci/amd_bus.c8
-rw-r--r--arch/x86/pci/i386.c7
-rw-r--r--arch/x86/power/Makefile2
69 files changed, 1328 insertions, 628 deletions
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index d660be492363..49e0c18833e0 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -37,14 +37,13 @@ static int set_bios_mode(u8 mode)
37 ireg.al = mode; /* AH=0x00 Set Video Mode */ 37 ireg.al = mode; /* AH=0x00 Set Video Mode */
38 intcall(0x10, &ireg, NULL); 38 intcall(0x10, &ireg, NULL);
39 39
40
41 ireg.ah = 0x0f; /* Get Current Video Mode */ 40 ireg.ah = 0x0f; /* Get Current Video Mode */
42 intcall(0x10, &ireg, &oreg); 41 intcall(0x10, &ireg, &oreg);
43 42
44 do_restore = 1; /* Assume video contents were lost */ 43 do_restore = 1; /* Assume video contents were lost */
45 44
46 /* Not all BIOSes are clean with the top bit */ 45 /* Not all BIOSes are clean with the top bit */
47 new_mode = ireg.al & 0x7f; 46 new_mode = oreg.al & 0x7f;
48 47
49 if (new_mode == mode) 48 if (new_mode == mode)
50 return 0; /* Mode change OK */ 49 return 0; /* Mode change OK */
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index c700147d6ffb..275dd177f198 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -45,7 +45,7 @@ static int vesa_probe(void)
45 ireg.di = (size_t)&vginfo; 45 ireg.di = (size_t)&vginfo;
46 intcall(0x10, &ireg, &oreg); 46 intcall(0x10, &ireg, &oreg);
47 47
48 if (ireg.ax != 0x004f || 48 if (oreg.ax != 0x004f ||
49 vginfo.signature != VESA_MAGIC || 49 vginfo.signature != VESA_MAGIC ||
50 vginfo.version < 0x0102) 50 vginfo.version < 0x0102)
51 return 0; /* Not present */ 51 return 0; /* Not present */
@@ -70,7 +70,7 @@ static int vesa_probe(void)
70 ireg.di = (size_t)&vminfo; 70 ireg.di = (size_t)&vminfo;
71 intcall(0x10, &ireg, &oreg); 71 intcall(0x10, &ireg, &oreg);
72 72
73 if (ireg.ax != 0x004f) 73 if (oreg.ax != 0x004f)
74 continue; 74 continue;
75 75
76 if ((vminfo.mode_attr & 0x15) == 0x05) { 76 if ((vminfo.mode_attr & 0x15) == 0x05) {
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 2503d4e64c2a..dc5a667ff791 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -19,7 +19,10 @@
19 * 19 *
20 * Atomically reads the value of @v. 20 * Atomically reads the value of @v.
21 */ 21 */
22#define atomic_read(v) ((v)->counter) 22static inline int atomic_read(const atomic_t *v)
23{
24 return v->counter;
25}
23 26
24/** 27/**
25 * atomic_set - set atomic variable 28 * atomic_set - set atomic variable
@@ -28,7 +31,10 @@
28 * 31 *
29 * Atomically sets the value of @v to @i. 32 * Atomically sets the value of @v to @i.
30 */ 33 */
31#define atomic_set(v, i) (((v)->counter) = (i)) 34static inline void atomic_set(atomic_t *v, int i)
35{
36 v->counter = i;
37}
32 38
33/** 39/**
34 * atomic_add - add integer to atomic variable 40 * atomic_add - add integer to atomic variable
@@ -200,8 +206,15 @@ static inline int atomic_sub_return(int i, atomic_t *v)
200 return atomic_add_return(-i, v); 206 return atomic_add_return(-i, v);
201} 207}
202 208
203#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 209static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
204#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) 210{
211 return cmpxchg(&v->counter, old, new);
212}
213
214static inline int atomic_xchg(atomic_t *v, int new)
215{
216 return xchg(&v->counter, new);
217}
205 218
206/** 219/**
207 * atomic_add_unless - add unless the number is already a given value 220 * atomic_add_unless - add unless the number is already a given value
@@ -250,45 +263,12 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
250/* An 64bit atomic type */ 263/* An 64bit atomic type */
251 264
252typedef struct { 265typedef struct {
253 unsigned long long counter; 266 u64 __aligned(8) counter;
254} atomic64_t; 267} atomic64_t;
255 268
256#define ATOMIC64_INIT(val) { (val) } 269#define ATOMIC64_INIT(val) { (val) }
257 270
258/** 271extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
259 * atomic64_read - read atomic64 variable
260 * @ptr: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292 272
293/** 273/**
294 * atomic64_xchg - xchg atomic64 variable 274 * atomic64_xchg - xchg atomic64 variable
@@ -298,18 +278,7 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
298 * Atomically xchgs the value of @ptr to @new_val and returns 278 * Atomically xchgs the value of @ptr to @new_val and returns
299 * the old value. 279 * the old value.
300 */ 280 */
301 281extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
302static inline unsigned long long
303atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
304{
305 unsigned long long old_val;
306
307 do {
308 old_val = atomic_read(ptr);
309 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
310
311 return old_val;
312}
313 282
314/** 283/**
315 * atomic64_set - set atomic64 variable 284 * atomic64_set - set atomic64 variable
@@ -318,10 +287,7 @@ atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
318 * 287 *
319 * Atomically sets the value of @ptr to @new_val. 288 * Atomically sets the value of @ptr to @new_val.
320 */ 289 */
321static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) 290extern void atomic64_set(atomic64_t *ptr, u64 new_val);
322{
323 atomic64_xchg(ptr, new_val);
324}
325 291
326/** 292/**
327 * atomic64_read - read atomic64 variable 293 * atomic64_read - read atomic64 variable
@@ -329,17 +295,30 @@ static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
329 * 295 *
330 * Atomically reads the value of @ptr and returns it. 296 * Atomically reads the value of @ptr and returns it.
331 */ 297 */
332static inline unsigned long long atomic64_read(atomic64_t *ptr) 298static inline u64 atomic64_read(atomic64_t *ptr)
333{ 299{
334 unsigned long long curr_val; 300 u64 res;
335 301
336 do { 302 /*
337 curr_val = __atomic64_read(ptr); 303 * Note, we inline this atomic64_t primitive because
338 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); 304 * it only clobbers EAX/EDX and leaves the others
339 305 * untouched. We also (somewhat subtly) rely on the
340 return curr_val; 306 * fact that cmpxchg8b returns the current 64-bit value
307 * of the memory location we are touching:
308 */
309 asm volatile(
310 "mov %%ebx, %%eax\n\t"
311 "mov %%ecx, %%edx\n\t"
312 LOCK_PREFIX "cmpxchg8b %1\n"
313 : "=&A" (res)
314 : "m" (*ptr)
315 );
316
317 return res;
341} 318}
342 319
320extern u64 atomic64_read(atomic64_t *ptr);
321
343/** 322/**
344 * atomic64_add_return - add and return 323 * atomic64_add_return - add and return
345 * @delta: integer value to add 324 * @delta: integer value to add
@@ -347,34 +326,14 @@ static inline unsigned long long atomic64_read(atomic64_t *ptr)
347 * 326 *
348 * Atomically adds @delta to @ptr and returns @delta + *@ptr 327 * Atomically adds @delta to @ptr and returns @delta + *@ptr
349 */ 328 */
350static inline unsigned long long 329extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
351atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
352{
353 unsigned long long old_val, new_val;
354
355 do {
356 old_val = atomic_read(ptr);
357 new_val = old_val + delta;
358
359 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
360
361 return new_val;
362}
363
364static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
365{
366 return atomic64_add_return(-delta, ptr);
367}
368 330
369static inline long atomic64_inc_return(atomic64_t *ptr) 331/*
370{ 332 * Other variants with different arithmetic operators:
371 return atomic64_add_return(1, ptr); 333 */
372} 334extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
373 335extern u64 atomic64_inc_return(atomic64_t *ptr);
374static inline long atomic64_dec_return(atomic64_t *ptr) 336extern u64 atomic64_dec_return(atomic64_t *ptr);
375{
376 return atomic64_sub_return(1, ptr);
377}
378 337
379/** 338/**
380 * atomic64_add - add integer to atomic64 variable 339 * atomic64_add - add integer to atomic64 variable
@@ -383,10 +342,7 @@ static inline long atomic64_dec_return(atomic64_t *ptr)
383 * 342 *
384 * Atomically adds @delta to @ptr. 343 * Atomically adds @delta to @ptr.
385 */ 344 */
386static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) 345extern void atomic64_add(u64 delta, atomic64_t *ptr);
387{
388 atomic64_add_return(delta, ptr);
389}
390 346
391/** 347/**
392 * atomic64_sub - subtract the atomic64 variable 348 * atomic64_sub - subtract the atomic64 variable
@@ -395,10 +351,7 @@ static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
395 * 351 *
396 * Atomically subtracts @delta from @ptr. 352 * Atomically subtracts @delta from @ptr.
397 */ 353 */
398static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) 354extern void atomic64_sub(u64 delta, atomic64_t *ptr);
399{
400 atomic64_add(-delta, ptr);
401}
402 355
403/** 356/**
404 * atomic64_sub_and_test - subtract value from variable and test result 357 * atomic64_sub_and_test - subtract value from variable and test result
@@ -409,13 +362,7 @@ static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
409 * true if the result is zero, or false for all 362 * true if the result is zero, or false for all
410 * other cases. 363 * other cases.
411 */ 364 */
412static inline int 365extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
413atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
414{
415 unsigned long long old_val = atomic64_sub_return(delta, ptr);
416
417 return old_val == 0;
418}
419 366
420/** 367/**
421 * atomic64_inc - increment atomic64 variable 368 * atomic64_inc - increment atomic64 variable
@@ -423,10 +370,7 @@ atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
423 * 370 *
424 * Atomically increments @ptr by 1. 371 * Atomically increments @ptr by 1.
425 */ 372 */
426static inline void atomic64_inc(atomic64_t *ptr) 373extern void atomic64_inc(atomic64_t *ptr);
427{
428 atomic64_add(1, ptr);
429}
430 374
431/** 375/**
432 * atomic64_dec - decrement atomic64 variable 376 * atomic64_dec - decrement atomic64 variable
@@ -434,10 +378,7 @@ static inline void atomic64_inc(atomic64_t *ptr)
434 * 378 *
435 * Atomically decrements @ptr by 1. 379 * Atomically decrements @ptr by 1.
436 */ 380 */
437static inline void atomic64_dec(atomic64_t *ptr) 381extern void atomic64_dec(atomic64_t *ptr);
438{
439 atomic64_sub(1, ptr);
440}
441 382
442/** 383/**
443 * atomic64_dec_and_test - decrement and test 384 * atomic64_dec_and_test - decrement and test
@@ -447,10 +388,7 @@ static inline void atomic64_dec(atomic64_t *ptr)
447 * returns true if the result is 0, or false for all other 388 * returns true if the result is 0, or false for all other
448 * cases. 389 * cases.
449 */ 390 */
450static inline int atomic64_dec_and_test(atomic64_t *ptr) 391extern int atomic64_dec_and_test(atomic64_t *ptr);
451{
452 return atomic64_sub_and_test(1, ptr);
453}
454 392
455/** 393/**
456 * atomic64_inc_and_test - increment and test 394 * atomic64_inc_and_test - increment and test
@@ -460,10 +398,7 @@ static inline int atomic64_dec_and_test(atomic64_t *ptr)
460 * and returns true if the result is zero, or false for all 398 * and returns true if the result is zero, or false for all
461 * other cases. 399 * other cases.
462 */ 400 */
463static inline int atomic64_inc_and_test(atomic64_t *ptr) 401extern int atomic64_inc_and_test(atomic64_t *ptr);
464{
465 return atomic64_sub_and_test(-1, ptr);
466}
467 402
468/** 403/**
469 * atomic64_add_negative - add and test if negative 404 * atomic64_add_negative - add and test if negative
@@ -474,13 +409,7 @@ static inline int atomic64_inc_and_test(atomic64_t *ptr)
474 * if the result is negative, or false when 409 * if the result is negative, or false when
475 * result is greater than or equal to zero. 410 * result is greater than or equal to zero.
476 */ 411 */
477static inline int 412extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
478atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
479{
480 long long old_val = atomic64_add_return(delta, ptr);
481
482 return old_val < 0;
483}
484 413
485#include <asm-generic/atomic-long.h> 414#include <asm-generic/atomic-long.h>
486#endif /* _ASM_X86_ATOMIC_32_H */ 415#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
index 0d6360220007..d605dc268e79 100644
--- a/arch/x86/include/asm/atomic_64.h
+++ b/arch/x86/include/asm/atomic_64.h
@@ -18,7 +18,10 @@
18 * 18 *
19 * Atomically reads the value of @v. 19 * Atomically reads the value of @v.
20 */ 20 */
21#define atomic_read(v) ((v)->counter) 21static inline int atomic_read(const atomic_t *v)
22{
23 return v->counter;
24}
22 25
23/** 26/**
24 * atomic_set - set atomic variable 27 * atomic_set - set atomic variable
@@ -27,7 +30,10 @@
27 * 30 *
28 * Atomically sets the value of @v to @i. 31 * Atomically sets the value of @v to @i.
29 */ 32 */
30#define atomic_set(v, i) (((v)->counter) = (i)) 33static inline void atomic_set(atomic_t *v, int i)
34{
35 v->counter = i;
36}
31 37
32/** 38/**
33 * atomic_add - add integer to atomic variable 39 * atomic_add - add integer to atomic variable
@@ -192,7 +198,10 @@ static inline int atomic_sub_return(int i, atomic_t *v)
192 * Atomically reads the value of @v. 198 * Atomically reads the value of @v.
193 * Doesn't imply a read memory barrier. 199 * Doesn't imply a read memory barrier.
194 */ 200 */
195#define atomic64_read(v) ((v)->counter) 201static inline long atomic64_read(const atomic64_t *v)
202{
203 return v->counter;
204}
196 205
197/** 206/**
198 * atomic64_set - set atomic64 variable 207 * atomic64_set - set atomic64 variable
@@ -201,7 +210,10 @@ static inline int atomic_sub_return(int i, atomic_t *v)
201 * 210 *
202 * Atomically sets the value of @v to @i. 211 * Atomically sets the value of @v to @i.
203 */ 212 */
204#define atomic64_set(v, i) (((v)->counter) = (i)) 213static inline void atomic64_set(atomic64_t *v, long i)
214{
215 v->counter = i;
216}
205 217
206/** 218/**
207 * atomic64_add - add integer to atomic64 variable 219 * atomic64_add - add integer to atomic64 variable
@@ -355,11 +367,25 @@ static inline long atomic64_sub_return(long i, atomic64_t *v)
355#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) 367#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
356#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) 368#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
357 369
358#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 370static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
359#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) 371{
372 return cmpxchg(&v->counter, old, new);
373}
374
375static inline long atomic64_xchg(atomic64_t *v, long new)
376{
377 return xchg(&v->counter, new);
378}
360 379
361#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 380static inline long atomic_cmpxchg(atomic_t *v, int old, int new)
362#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) 381{
382 return cmpxchg(&v->counter, old, new);
383}
384
385static inline long atomic_xchg(atomic_t *v, int new)
386{
387 return xchg(&v->counter, new);
388}
363 389
364/** 390/**
365 * atomic_add_unless - add unless the number is a given value 391 * atomic_add_unless - add unless the number is a given value
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index edc90f23e708..8406ed7f9926 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
33#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ 33#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
34 efi_call_virt(f, a1, a2, a3, a4, a5, a6) 34 efi_call_virt(f, a1, a2, a3, a4, a5, a6)
35 35
36#define efi_ioremap(addr, size) ioremap_cache(addr, size) 36#define efi_ioremap(addr, size, type) ioremap_cache(addr, size)
37 37
38#else /* !CONFIG_X86_32 */ 38#else /* !CONFIG_X86_32 */
39 39
@@ -84,7 +84,8 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
84 efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ 84 efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
85 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) 85 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
86 86
87extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); 87extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
88 u32 type);
88 89
89#endif /* CONFIG_X86_32 */ 90#endif /* CONFIG_X86_32 */
90 91
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 2d81af3974a0..7b2d71df39a6 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -111,12 +111,9 @@ enum fixed_addresses {
111#ifdef CONFIG_PARAVIRT 111#ifdef CONFIG_PARAVIRT
112 FIX_PARAVIRT_BOOTMAP, 112 FIX_PARAVIRT_BOOTMAP,
113#endif 113#endif
114 FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ 114 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
115 FIX_TEXT_POKE1, 115 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
116 __end_of_permanent_fixed_addresses, 116 __end_of_permanent_fixed_addresses,
117#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
118 FIX_OHCI1394_BASE,
119#endif
120 /* 117 /*
121 * 256 temporary boot-time mappings, used by early_ioremap(), 118 * 256 temporary boot-time mappings, used by early_ioremap(),
122 * before ioremap() is functional. 119 * before ioremap() is functional.
@@ -129,6 +126,9 @@ enum fixed_addresses {
129 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - 126 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
130 (__end_of_permanent_fixed_addresses & 255), 127 (__end_of_permanent_fixed_addresses & 255),
131 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, 128 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
129#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
130 FIX_OHCI1394_BASE,
131#endif
132#ifdef CONFIG_X86_32 132#ifdef CONFIG_X86_32
133 FIX_WP_TEST, 133 FIX_WP_TEST,
134#endif 134#endif
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index daf866ed0612..330ee807f89e 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -161,6 +161,7 @@ extern int io_apic_set_pci_routing(struct device *dev, int irq,
161 struct io_apic_irq_attr *irq_attr); 161 struct io_apic_irq_attr *irq_attr);
162extern int (*ioapic_renumber_irq)(int ioapic, int irq); 162extern int (*ioapic_renumber_irq)(int ioapic, int irq);
163extern void ioapic_init_mappings(void); 163extern void ioapic_init_mappings(void);
164extern void ioapic_insert_resources(void);
164 165
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 166extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 167extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
@@ -180,6 +181,7 @@ extern void ioapic_write_entry(int apic, int pin,
180#define io_apic_assign_pci_irqs 0 181#define io_apic_assign_pci_irqs 0
181static const int timer_through_8259 = 0; 182static const int timer_through_8259 = 0;
182static inline void ioapic_init_mappings(void) { } 183static inline void ioapic_init_mappings(void) { }
184static inline void ioapic_insert_resources(void) { }
183 185
184static inline void probe_nr_irqs_gsi(void) { } 186static inline void probe_nr_irqs_gsi(void) { }
185#endif 187#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 2bdab21f0898..c6ccbe7e81ad 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -12,9 +12,15 @@ static inline unsigned long native_save_fl(void)
12{ 12{
13 unsigned long flags; 13 unsigned long flags;
14 14
15 /*
16 * Note: this needs to be "=r" not "=rm", because we have the
17 * stack offset from what gcc expects at the time the "pop" is
18 * executed, and so a memory reference with respect to the stack
19 * would end up using the wrong address.
20 */
15 asm volatile("# __raw_save_flags\n\t" 21 asm volatile("# __raw_save_flags\n\t"
16 "pushf ; pop %0" 22 "pushf ; pop %0"
17 : "=g" (flags) 23 : "=r" (flags)
18 : /* no input */ 24 : /* no input */
19 : "memory"); 25 : "memory");
20 26
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 313389cd50d2..5136dad57cbb 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,7 @@
17/* Pages for switcher itself, then two pages per cpu */ 17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
19 19
20/* We map at -4M (-2M when PAE is activated) for ease of mapping 20/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
21 * into the guest (one PTE page). */
22#ifdef CONFIG_X86_PAE 21#ifdef CONFIG_X86_PAE
23#define SWITCHER_ADDR 0xFFE00000 22#define SWITCHER_ADDR 0xFFE00000
24#else 23#else
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index d31c4a684078..ba0eed8aa1a6 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -30,27 +30,27 @@
30#include <asm/hw_irq.h> 30#include <asm/hw_irq.h>
31#include <asm/kvm_para.h> 31#include <asm/kvm_para.h>
32 32
33/*G:031 But first, how does our Guest contact the Host to ask for privileged 33/*G:030
34 * But first, how does our Guest contact the Host to ask for privileged
34 * operations? There are two ways: the direct way is to make a "hypercall", 35 * operations? There are two ways: the direct way is to make a "hypercall",
35 * to make requests of the Host Itself. 36 * to make requests of the Host Itself.
36 * 37 *
37 * We use the KVM hypercall mechanism. Seventeen hypercalls are 38 * We use the KVM hypercall mechanism, though completely different hypercall
38 * available: the hypercall number is put in the %eax register, and the 39 * numbers. Seventeen hypercalls are available: the hypercall number is put in
39 * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. 40 * the %eax register, and the arguments (when required) are placed in %ebx,
40 * If a return value makes sense, it's returned in %eax. 41 * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax.
41 * 42 *
42 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
43 * Host, rather than returning failure. This reflects Winston Churchill's 44 * Host, rather than returning failure. This reflects Winston Churchill's
44 * definition of a gentleman: "someone who is only rude intentionally". */ 45 * definition of a gentleman: "someone who is only rude intentionally".
45/*:*/ 46:*/
46 47
47/* Can't use our min() macro here: needs to be a constant */ 48/* Can't use our min() macro here: needs to be a constant */
48#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 49#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
49 50
50#define LHCALL_RING_SIZE 64 51#define LHCALL_RING_SIZE 64
51struct hcall_args { 52struct hcall_args {
52 /* These map directly onto eax, ebx, ecx, edx and esi 53 /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
53 * in struct lguest_regs */
54 unsigned long arg0, arg1, arg2, arg3, arg4; 54 unsigned long arg0, arg1, arg2, arg3, arg4;
55}; 55};
56 56
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1692fb5050e3..6be7fc254b59 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -246,10 +246,6 @@
246#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) 246#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38)
247#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) 247#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39)
248 248
249/* Intel Model 6 */
250#define MSR_P6_EVNTSEL0 0x00000186
251#define MSR_P6_EVNTSEL1 0x00000187
252
253/* P4/Xeon+ specific */ 249/* P4/Xeon+ specific */
254#define MSR_IA32_MCG_EAX 0x00000180 250#define MSR_IA32_MCG_EAX 0x00000180
255#define MSR_IA32_MCG_EBX 0x00000181 251#define MSR_IA32_MCG_EBX 0x00000181
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c97264409934..c86e5ed4af51 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -72,7 +72,6 @@ void lapic_watchdog_stop(void);
72int lapic_watchdog_init(unsigned nmi_hz); 72int lapic_watchdog_init(unsigned nmi_hz);
73int lapic_wd_event(unsigned nmi_hz); 73int lapic_wd_event(unsigned nmi_hz);
74unsigned lapic_adjust_nmi_hz(unsigned hz); 74unsigned lapic_adjust_nmi_hz(unsigned hz);
75int lapic_watchdog_ok(void);
76void disable_lapic_nmi_watchdog(void); 75void disable_lapic_nmi_watchdog(void);
77void enable_lapic_nmi_watchdog(void); 76void enable_lapic_nmi_watchdog(void);
78void stop_nmi(void); 77void stop_nmi(void);
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index dd14c54ac718..0e8c2a0fd922 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte)
46 __free_page(pte); 46 __free_page(pte);
47} 47}
48 48
49extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); 49extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
50
51static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
52 unsigned long address)
53{
54 ___pte_free_tlb(tlb, pte);
55}
50 56
51static inline void pmd_populate_kernel(struct mm_struct *mm, 57static inline void pmd_populate_kernel(struct mm_struct *mm,
52 pmd_t *pmd, pte_t *pte) 58 pmd_t *pmd, pte_t *pte)
@@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
78 free_page((unsigned long)pmd); 84 free_page((unsigned long)pmd);
79} 85}
80 86
81extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); 87extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
88
89static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
90 unsigned long adddress)
91{
92 ___pmd_free_tlb(tlb, pmd);
93}
82 94
83#ifdef CONFIG_X86_PAE 95#ifdef CONFIG_X86_PAE
84extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); 96extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
@@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
108 free_page((unsigned long)pud); 120 free_page((unsigned long)pud);
109} 121}
110 122
111extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); 123extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
124
125static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
126 unsigned long address)
127{
128 ___pud_free_tlb(tlb, pud);
129}
130
112#endif /* PAGETABLE_LEVELS > 3 */ 131#endif /* PAGETABLE_LEVELS > 3 */
113#endif /* PAGETABLE_LEVELS > 2 */ 132#endif /* PAGETABLE_LEVELS > 2 */
114 133
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index b7e5db876399..4e77853321db 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -302,4 +302,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
302#define _raw_read_relax(lock) cpu_relax() 302#define _raw_read_relax(lock) cpu_relax()
303#define _raw_write_relax(lock) cpu_relax() 303#define _raw_write_relax(lock) cpu_relax()
304 304
305/* The {read|write|spin}_lock() on x86 are full memory barriers. */
306static inline void smp_mb__after_lock(void) { }
307#define ARCH_HAS_SMP_MB_AFTER_LOCK
308
305#endif /* _ASM_X86_SPINLOCK_H */ 309#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f517944b2b17..cf86a5e73815 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,6 +3,8 @@
3 3
4extern int kstack_depth_to_print; 4extern int kstack_depth_to_print;
5 5
6int x86_is_stack_id(int id, char *name);
7
6/* Generic stack tracer with callbacks */ 8/* Generic stack tracer with callbacks */
7 9
8struct stacktrace_ops { 10struct stacktrace_ops {
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index b0783520988b..fad7d40b75f8 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -49,7 +49,7 @@ struct thread_info {
49 .exec_domain = &default_exec_domain, \ 49 .exec_domain = &default_exec_domain, \
50 .flags = 0, \ 50 .flags = 0, \
51 .cpu = 0, \ 51 .cpu = 0, \
52 .preempt_count = 1, \ 52 .preempt_count = INIT_PREEMPT_COUNT, \
53 .addr_limit = KERNEL_DS, \ 53 .addr_limit = KERNEL_DS, \
54 .restart_block = { \ 54 .restart_block = { \
55 .fn = do_no_restart_syscall, \ 55 .fn = do_no_restart_syscall, \
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 20e6a795e160..d2c6c930b491 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -212,9 +212,9 @@ extern int __get_user_bad(void);
212 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") 212 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
213#else 213#else
214#define __put_user_asm_u64(x, ptr, retval, errret) \ 214#define __put_user_asm_u64(x, ptr, retval, errret) \
215 __put_user_asm(x, ptr, retval, "q", "", "Zr", errret) 215 __put_user_asm(x, ptr, retval, "q", "", "er", errret)
216#define __put_user_asm_ex_u64(x, addr) \ 216#define __put_user_asm_ex_u64(x, addr) \
217 __put_user_asm_ex(x, addr, "q", "", "Zr") 217 __put_user_asm_ex(x, addr, "q", "", "er")
218#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) 218#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
219#endif 219#endif
220 220
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 8cc687326eb8..db24b215fc50 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -88,11 +88,11 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
88 ret, "l", "k", "ir", 4); 88 ret, "l", "k", "ir", 4);
89 return ret; 89 return ret;
90 case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, 90 case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
91 ret, "q", "", "ir", 8); 91 ret, "q", "", "er", 8);
92 return ret; 92 return ret;
93 case 10: 93 case 10:
94 __put_user_asm(*(u64 *)src, (u64 __user *)dst, 94 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
95 ret, "q", "", "ir", 10); 95 ret, "q", "", "er", 10);
96 if (unlikely(ret)) 96 if (unlikely(ret))
97 return ret; 97 return ret;
98 asm("":::"memory"); 98 asm("":::"memory");
@@ -101,12 +101,12 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
101 return ret; 101 return ret;
102 case 16: 102 case 16:
103 __put_user_asm(*(u64 *)src, (u64 __user *)dst, 103 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
104 ret, "q", "", "ir", 16); 104 ret, "q", "", "er", 16);
105 if (unlikely(ret)) 105 if (unlikely(ret))
106 return ret; 106 return ret;
107 asm("":::"memory"); 107 asm("":::"memory");
108 __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, 108 __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
109 ret, "q", "", "ir", 8); 109 ret, "q", "", "er", 8);
110 return ret; 110 return ret;
111 default: 111 default:
112 return copy_user_generic((__force void *)dst, src, size); 112 return copy_user_generic((__force void *)dst, src, size);
@@ -157,7 +157,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
157 ret, "q", "", "=r", 8); 157 ret, "q", "", "=r", 8);
158 if (likely(!ret)) 158 if (likely(!ret))
159 __put_user_asm(tmp, (u64 __user *)dst, 159 __put_user_asm(tmp, (u64 __user *)dst,
160 ret, "q", "", "ir", 8); 160 ret, "q", "", "er", 8);
161 return ret; 161 return ret;
162 } 162 }
163 default: 163 default:
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 341070f7ad5c..77a68505419a 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -175,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) 175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
176 176
177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ 177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
178 ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) 178 (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
179 179
180#define UV_APIC_PNODE_SHIFT 6 180#define UV_APIC_PNODE_SHIFT 6
181 181
@@ -327,6 +327,7 @@ struct uv_blade_info {
327 unsigned short nr_possible_cpus; 327 unsigned short nr_possible_cpus;
328 unsigned short nr_online_cpus; 328 unsigned short nr_online_cpus;
329 unsigned short pnode; 329 unsigned short pnode;
330 short memory_nid;
330}; 331};
331extern struct uv_blade_info *uv_blade_info; 332extern struct uv_blade_info *uv_blade_info;
332extern short *uv_node_to_blade; 333extern short *uv_node_to_blade;
@@ -363,6 +364,12 @@ static inline int uv_blade_to_pnode(int bid)
363 return uv_blade_info[bid].pnode; 364 return uv_blade_info[bid].pnode;
364} 365}
365 366
367/* Nid of memory node on blade. -1 if no blade-local memory */
368static inline int uv_blade_to_memory_nid(int bid)
369{
370 return uv_blade_info[bid].memory_nid;
371}
372
366/* Determine the number of possible cpus on a blade */ 373/* Determine the number of possible cpus on a blade */
367static inline int uv_blade_nr_possible_cpus(int bid) 374static inline int uv_blade_nr_possible_cpus(int bid)
368{ 375{
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 9372f0406ad4..6c99f5037801 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1192,7 +1192,7 @@ out:
1192 return 0; 1192 return 0;
1193} 1193}
1194 1194
1195struct notifier_block device_nb = { 1195static struct notifier_block device_nb = {
1196 .notifier_call = device_change_notifier, 1196 .notifier_call = device_change_notifier,
1197}; 1197};
1198 1198
@@ -1763,7 +1763,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
1763 flag |= __GFP_ZERO; 1763 flag |= __GFP_ZERO;
1764 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 1764 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1765 if (!virt_addr) 1765 if (!virt_addr)
1766 return 0; 1766 return NULL;
1767 1767
1768 paddr = virt_to_phys(virt_addr); 1768 paddr = virt_to_phys(virt_addr);
1769 1769
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 10b2accd12ea..c1b17e97252e 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -472,6 +472,8 @@ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
472 if (iommu->evt_buf == NULL) 472 if (iommu->evt_buf == NULL)
473 return NULL; 473 return NULL;
474 474
475 iommu->evt_buf_size = EVT_BUFFER_SIZE;
476
475 return iommu->evt_buf; 477 return iommu->evt_buf;
476} 478}
477 479
@@ -691,6 +693,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
691 693
692 devid = e->devid; 694 devid = e->devid;
693 devid_to = e->ext >> 8; 695 devid_to = e->ext >> 8;
696 set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
694 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); 697 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
695 amd_iommu_alias_table[devid] = devid_to; 698 amd_iommu_alias_table[devid] = devid_to;
696 break; 699 break;
@@ -749,11 +752,13 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
749 752
750 devid = e->devid; 753 devid = e->devid;
751 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 754 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
752 if (alias) 755 if (alias) {
753 amd_iommu_alias_table[dev_i] = devid_to; 756 amd_iommu_alias_table[dev_i] = devid_to;
754 set_dev_entry_from_acpi(iommu, 757 set_dev_entry_from_acpi(iommu,
755 amd_iommu_alias_table[dev_i], 758 devid_to, flags, ext_flags);
756 flags, ext_flags); 759 }
760 set_dev_entry_from_acpi(iommu, dev_i,
761 flags, ext_flags);
757 } 762 }
758 break; 763 break;
759 default: 764 default:
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8c7c042ecad1..0a1c2830ec66 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -140,7 +140,6 @@ int x2apic_mode;
140#ifdef CONFIG_X86_X2APIC 140#ifdef CONFIG_X86_X2APIC
141/* x2apic enabled before OS handover */ 141/* x2apic enabled before OS handover */
142static int x2apic_preenabled; 142static int x2apic_preenabled;
143static int disable_x2apic;
144static __init int setup_nox2apic(char *str) 143static __init int setup_nox2apic(char *str)
145{ 144{
146 if (x2apic_enabled()) { 145 if (x2apic_enabled()) {
@@ -149,7 +148,6 @@ static __init int setup_nox2apic(char *str)
149 return 0; 148 return 0;
150 } 149 }
151 150
152 disable_x2apic = 1;
153 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 151 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
154 return 0; 152 return 0;
155} 153}
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 69328ac8de9c..8952a5890281 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
652 return ret && es7000_apic_is_cluster(); 652 return ret && es7000_apic_is_cluster();
653} 653}
654 654
655struct apic apic_es7000_cluster = { 655/* We've been warned by a false positive warning.Use __refdata to keep calm. */
656struct apic __refdata apic_es7000_cluster = {
656 657
657 .name = "es7000", 658 .name = "es7000",
658 .probe = probe_es7000, 659 .probe = probe_es7000,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4d0216fcb36c..d2ed6c5ddc80 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1716,25 +1716,19 @@ __apicdebuginit(void) print_IO_APIC(void)
1716 return; 1716 return;
1717} 1717}
1718 1718
1719__apicdebuginit(void) print_APIC_bitfield(int base) 1719__apicdebuginit(void) print_APIC_field(int base)
1720{ 1720{
1721 unsigned int v; 1721 int i;
1722 int i, j;
1723 1722
1724 if (apic_verbosity == APIC_QUIET) 1723 if (apic_verbosity == APIC_QUIET)
1725 return; 1724 return;
1726 1725
1727 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); 1726 printk(KERN_DEBUG);
1728 for (i = 0; i < 8; i++) { 1727
1729 v = apic_read(base + i*0x10); 1728 for (i = 0; i < 8; i++)
1730 for (j = 0; j < 32; j++) { 1729 printk(KERN_CONT "%08x", apic_read(base + i*0x10));
1731 if (v & (1<<j)) 1730
1732 printk("1"); 1731 printk(KERN_CONT "\n");
1733 else
1734 printk("0");
1735 }
1736 printk("\n");
1737 }
1738} 1732}
1739 1733
1740__apicdebuginit(void) print_local_APIC(void *dummy) 1734__apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1745,7 +1739,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1745 if (apic_verbosity == APIC_QUIET) 1739 if (apic_verbosity == APIC_QUIET)
1746 return; 1740 return;
1747 1741
1748 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1742 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1749 smp_processor_id(), hard_smp_processor_id()); 1743 smp_processor_id(), hard_smp_processor_id());
1750 v = apic_read(APIC_ID); 1744 v = apic_read(APIC_ID);
1751 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); 1745 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
@@ -1786,11 +1780,11 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1786 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); 1780 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1787 1781
1788 printk(KERN_DEBUG "... APIC ISR field:\n"); 1782 printk(KERN_DEBUG "... APIC ISR field:\n");
1789 print_APIC_bitfield(APIC_ISR); 1783 print_APIC_field(APIC_ISR);
1790 printk(KERN_DEBUG "... APIC TMR field:\n"); 1784 printk(KERN_DEBUG "... APIC TMR field:\n");
1791 print_APIC_bitfield(APIC_TMR); 1785 print_APIC_field(APIC_TMR);
1792 printk(KERN_DEBUG "... APIC IRR field:\n"); 1786 printk(KERN_DEBUG "... APIC IRR field:\n");
1793 print_APIC_bitfield(APIC_IRR); 1787 print_APIC_field(APIC_IRR);
1794 1788
1795 if (APIC_INTEGRATED(ver)) { /* !82489DX */ 1789 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1796 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1790 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -3799,6 +3793,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3799 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3793 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3800 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3794 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3801 3795
3796 if (cfg->move_in_progress)
3797 send_cleanup_vector(cfg);
3798
3802 return irq; 3799 return irq;
3803} 3800}
3804 3801
@@ -4187,28 +4184,20 @@ fake_ioapic_page:
4187 } 4184 }
4188} 4185}
4189 4186
4190static int __init ioapic_insert_resources(void) 4187void __init ioapic_insert_resources(void)
4191{ 4188{
4192 int i; 4189 int i;
4193 struct resource *r = ioapic_resources; 4190 struct resource *r = ioapic_resources;
4194 4191
4195 if (!r) { 4192 if (!r) {
4196 if (nr_ioapics > 0) { 4193 if (nr_ioapics > 0)
4197 printk(KERN_ERR 4194 printk(KERN_ERR
4198 "IO APIC resources couldn't be allocated.\n"); 4195 "IO APIC resources couldn't be allocated.\n");
4199 return -1; 4196 return;
4200 }
4201 return 0;
4202 } 4197 }
4203 4198
4204 for (i = 0; i < nr_ioapics; i++) { 4199 for (i = 0; i < nr_ioapics; i++) {
4205 insert_resource(&iomem_resource, r); 4200 insert_resource(&iomem_resource, r);
4206 r++; 4201 r++;
4207 } 4202 }
4208
4209 return 0;
4210} 4203}
4211
4212/* Insert the IO APIC resources after PCI initialization has occured to handle
4213 * IO APICS that are mapped in on a BAR in PCI space. */
4214late_initcall(ioapic_insert_resources);
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 533e59c6fc82..ca96e68f0d23 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void)
493 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); 493 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
494} 494}
495 495
496struct apic apic_numaq = { 496/* Use __refdata to keep false positive warning calm. */
497struct apic __refdata apic_numaq = {
497 498
498 .name = "NUMAQ", 499 .name = "NUMAQ",
499 .probe = probe_numaq, 500 .probe = probe_numaq,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8e4cbb255c38..2ed4e2bb3b32 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -170,7 +170,7 @@ static unsigned long set_apic_id(unsigned int id)
170 170
171static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) 171static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
172{ 172{
173 return current_cpu_data.initial_apicid >> index_msb; 173 return initial_apicid >> index_msb;
174} 174}
175 175
176static void x2apic_send_IPI_self(int vector) 176static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a284359627e7..0b631c6a2e00 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -162,7 +162,7 @@ static unsigned long set_apic_id(unsigned int id)
162 162
163static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) 163static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
164{ 164{
165 return current_cpu_data.initial_apicid >> index_msb; 165 return initial_apicid >> index_msb;
166} 166}
167 167
168static void x2apic_send_IPI_self(int vector) 168static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 096d19aea2f7..832e908adcb5 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = {
261 .apic_id_registered = uv_apic_id_registered, 261 .apic_id_registered = uv_apic_id_registered,
262 262
263 .irq_delivery_mode = dest_Fixed, 263 .irq_delivery_mode = dest_Fixed,
264 .irq_dest_mode = 1, /* logical */ 264 .irq_dest_mode = 0, /* physical */
265 265
266 .target_cpus = uv_target_cpus, 266 .target_cpus = uv_target_cpus,
267 .disable_esr = 0, 267 .disable_esr = 0,
@@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
362 BUG(); 362 BUG();
363} 363}
364 364
365static __init void map_low_mmrs(void)
366{
367 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
368 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
369}
370
371enum map_type {map_wb, map_uc}; 365enum map_type {map_wb, map_uc};
372 366
373static __init void map_high(char *id, unsigned long base, int shift, 367static __init void map_high(char *id, unsigned long base, int shift,
@@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode)
395 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
396} 390}
397 391
398static __init void map_config_high(int max_pnode)
399{
400 union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
401 int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
402
403 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
404 if (cfg.s.enable)
405 map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
406}
407
408static __init void map_mmr_high(int max_pnode)
409{
410 union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
411 int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
412
413 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
414 if (mmr.s.enable)
415 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
416}
417
418static __init void map_mmioh_high(int max_pnode) 392static __init void map_mmioh_high(int max_pnode)
419{ 393{
420 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 394 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -566,8 +540,6 @@ void __init uv_system_init(void)
566 unsigned long mmr_base, present, paddr; 540 unsigned long mmr_base, present, paddr;
567 unsigned short pnode_mask; 541 unsigned short pnode_mask;
568 542
569 map_low_mmrs();
570
571 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 543 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
572 m_val = m_n_config.s.m_skt; 544 m_val = m_n_config.s.m_skt;
573 n_val = m_n_config.s.n_skt; 545 n_val = m_n_config.s.n_skt;
@@ -591,6 +563,8 @@ void __init uv_system_init(void)
591 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 563 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
592 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 564 uv_blade_info = kmalloc(bytes, GFP_KERNEL);
593 BUG_ON(!uv_blade_info); 565 BUG_ON(!uv_blade_info);
566 for (blade = 0; blade < uv_num_possible_blades(); blade++)
567 uv_blade_info[blade].memory_nid = -1;
594 568
595 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); 569 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
596 570
@@ -629,6 +603,9 @@ void __init uv_system_init(void)
629 lcpu = uv_blade_info[blade].nr_possible_cpus; 603 lcpu = uv_blade_info[blade].nr_possible_cpus;
630 uv_blade_info[blade].nr_possible_cpus++; 604 uv_blade_info[blade].nr_possible_cpus++;
631 605
606 /* Any node on the blade, else will contain -1. */
607 uv_blade_info[blade].memory_nid = nid;
608
632 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 609 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
633 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; 610 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
634 uv_cpu_hub_info(cpu)->m_val = m_val; 611 uv_cpu_hub_info(cpu)->m_val = m_val;
@@ -662,11 +639,10 @@ void __init uv_system_init(void)
662 pnode = (paddr >> m_val) & pnode_mask; 639 pnode = (paddr >> m_val) & pnode_mask;
663 blade = boot_pnode_to_blade(pnode); 640 blade = boot_pnode_to_blade(pnode);
664 uv_node_to_blade[nid] = blade; 641 uv_node_to_blade[nid] = blade;
642 max_pnode = max(pnode, max_pnode);
665 } 643 }
666 644
667 map_gru_high(max_pnode); 645 map_gru_high(max_pnode);
668 map_mmr_high(max_pnode);
669 map_config_high(max_pnode);
670 map_mmioh_high(max_pnode); 646 map_mmioh_high(max_pnode);
671 647
672 uv_cpu_init(); 648 uv_cpu_init();
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 79302e9a33a4..442b5508893f 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -811,7 +811,7 @@ static int apm_do_idle(void)
811 u8 ret = 0; 811 u8 ret = 0;
812 int idled = 0; 812 int idled = 0;
813 int polling; 813 int polling;
814 int err; 814 int err = 0;
815 815
816 polling = !!(current_thread_info()->status & TS_POLLING); 816 polling = !!(current_thread_info()->status & TS_POLLING);
817 if (polling) { 817 if (polling) {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 28e5f5956042..e2485b03f1cf 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -356,7 +356,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
356#endif 356#endif
357#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) 357#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
358 /* check CPU config space for extended APIC ID */ 358 /* check CPU config space for extended APIC ID */
359 if (c->x86 >= 0xf) { 359 if (cpu_has_apic && c->x86 >= 0xf) {
360 unsigned int val; 360 unsigned int val;
361 val = read_pci_config(0, 24, 0, 0x68); 361 val = read_pci_config(0, 24, 0, 0x68);
362 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) 362 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 81cbe64ed6b4..2a50ef891000 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -299,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
299static int transition_fid_vid(struct powernow_k8_data *data, 299static int transition_fid_vid(struct powernow_k8_data *data,
300 u32 reqfid, u32 reqvid) 300 u32 reqfid, u32 reqvid)
301{ 301{
302 if (core_voltage_pre_transition(data, reqvid)) 302 if (core_voltage_pre_transition(data, reqvid, reqfid))
303 return 1; 303 return 1;
304 304
305 if (core_frequency_transition(data, reqfid)) 305 if (core_frequency_transition(data, reqfid))
@@ -327,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data,
327 327
328/* Phase 1 - core voltage transition ... setup voltage */ 328/* Phase 1 - core voltage transition ... setup voltage */
329static int core_voltage_pre_transition(struct powernow_k8_data *data, 329static int core_voltage_pre_transition(struct powernow_k8_data *data,
330 u32 reqvid) 330 u32 reqvid, u32 reqfid)
331{ 331{
332 u32 rvosteps = data->rvo; 332 u32 rvosteps = data->rvo;
333 u32 savefid = data->currfid; 333 u32 savefid = data->currfid;
334 u32 maxvid, lo; 334 u32 maxvid, lo, rvomult = 1;
335 335
336 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " 336 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
337 "reqvid 0x%x, rvo 0x%x\n", 337 "reqvid 0x%x, rvo 0x%x\n",
338 smp_processor_id(), 338 smp_processor_id(),
339 data->currfid, data->currvid, reqvid, data->rvo); 339 data->currfid, data->currvid, reqvid, data->rvo);
340 340
341 if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
342 rvomult = 2;
343 rvosteps *= rvomult;
341 rdmsr(MSR_FIDVID_STATUS, lo, maxvid); 344 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
342 maxvid = 0x1f & (maxvid >> 16); 345 maxvid = 0x1f & (maxvid >> 16);
343 dprintk("ph1 maxvid=0x%x\n", maxvid); 346 dprintk("ph1 maxvid=0x%x\n", maxvid);
@@ -351,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data,
351 return 1; 354 return 1;
352 } 355 }
353 356
354 while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { 357 while ((rvosteps > 0) &&
358 ((rvomult * data->rvo + data->currvid) > reqvid)) {
355 if (data->currvid == maxvid) { 359 if (data->currvid == maxvid) {
356 rvosteps = 0; 360 rvosteps = 0;
357 } else { 361 } else {
@@ -384,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
384 u32 vcoreqfid, vcocurrfid, vcofiddiff; 388 u32 vcoreqfid, vcocurrfid, vcofiddiff;
385 u32 fid_interval, savevid = data->currvid; 389 u32 fid_interval, savevid = data->currvid;
386 390
387 if ((reqfid < HI_FID_TABLE_BOTTOM) &&
388 (data->currfid < HI_FID_TABLE_BOTTOM)) {
389 printk(KERN_ERR PFX "ph2: illegal lo-lo transition "
390 "0x%x 0x%x\n", reqfid, data->currfid);
391 return 1;
392 }
393
394 if (data->currfid == reqfid) { 391 if (data->currfid == reqfid) {
395 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", 392 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
396 data->currfid); 393 data->currfid);
@@ -407,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
407 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid 404 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
408 : vcoreqfid - vcocurrfid; 405 : vcoreqfid - vcocurrfid;
409 406
407 if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
408 vcofiddiff = 0;
409
410 while (vcofiddiff > 2) { 410 while (vcofiddiff > 2) {
411 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); 411 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
412 412
@@ -1081,14 +1081,6 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
1081 return 0; 1081 return 0;
1082 } 1082 }
1083 1083
1084 if ((fid < HI_FID_TABLE_BOTTOM) &&
1085 (data->currfid < HI_FID_TABLE_BOTTOM)) {
1086 printk(KERN_ERR PFX
1087 "ignoring illegal change in lo freq table-%x to 0x%x\n",
1088 data->currfid, fid);
1089 return 1;
1090 }
1091
1092 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", 1084 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
1093 smp_processor_id(), fid, vid); 1085 smp_processor_id(), fid, vid);
1094 freqs.old = find_khz_freq_from_fid(data->currfid); 1086 freqs.old = find_khz_freq_from_fid(data->currfid);
@@ -1267,7 +1259,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1267{ 1259{
1268 static const char ACPI_PSS_BIOS_BUG_MSG[] = 1260 static const char ACPI_PSS_BIOS_BUG_MSG[] =
1269 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" 1261 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1270 KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; 1262 FW_BUG PFX "Try again with latest BIOS.\n";
1271 struct powernow_k8_data *data; 1263 struct powernow_k8_data *data;
1272 struct init_on_cpu init_on_cpu; 1264 struct init_on_cpu init_on_cpu;
1273 int rc; 1265 int rc;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index c9c1190b5e1f..02ce824073cb 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -215,7 +215,8 @@ struct pst_s {
215 215
216#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) 216#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
217 217
218static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); 218static int core_voltage_pre_transition(struct powernow_k8_data *data,
219 u32 reqvid, u32 regfid);
219static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); 220static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
220static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); 221static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
221 222
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index af425b83202b..1cfb623ce11c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -194,14 +194,14 @@ static void print_mce(struct mce *m)
194 m->cs, m->ip); 194 m->cs, m->ip);
195 if (m->cs == __KERNEL_CS) 195 if (m->cs == __KERNEL_CS)
196 print_symbol("{%s}", m->ip); 196 print_symbol("{%s}", m->ip);
197 printk("\n"); 197 printk(KERN_CONT "\n");
198 } 198 }
199 printk(KERN_EMERG "TSC %llx ", m->tsc); 199 printk(KERN_EMERG "TSC %llx ", m->tsc);
200 if (m->addr) 200 if (m->addr)
201 printk("ADDR %llx ", m->addr); 201 printk(KERN_CONT "ADDR %llx ", m->addr);
202 if (m->misc) 202 if (m->misc)
203 printk("MISC %llx ", m->misc); 203 printk(KERN_CONT "MISC %llx ", m->misc);
204 printk("\n"); 204 printk(KERN_CONT "\n");
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid, 206 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid); 207 m->apicid);
@@ -209,13 +209,13 @@ static void print_mce(struct mce *m)
209 209
210static void print_mce_head(void) 210static void print_mce_head(void)
211{ 211{
212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); 212 printk(KERN_EMERG "\nHARDWARE ERROR\n");
213} 213}
214 214
215static void print_mce_tail(void) 215static void print_mce_tail(void)
216{ 216{
217 printk(KERN_EMERG "This is not a software problem!\n" 217 printk(KERN_EMERG "This is not a software problem!\n"
218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); 218 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
219} 219}
220 220
221#define PANIC_TIMEOUT 5 /* 5 seconds */ 221#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -1692,17 +1692,15 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1692 const char *buf, size_t siz) 1692 const char *buf, size_t siz)
1693{ 1693{
1694 char *p; 1694 char *p;
1695 int len;
1696 1695
1697 strncpy(mce_helper, buf, sizeof(mce_helper)); 1696 strncpy(mce_helper, buf, sizeof(mce_helper));
1698 mce_helper[sizeof(mce_helper)-1] = 0; 1697 mce_helper[sizeof(mce_helper)-1] = 0;
1699 len = strlen(mce_helper);
1700 p = strchr(mce_helper, '\n'); 1698 p = strchr(mce_helper, '\n');
1701 1699
1702 if (*p) 1700 if (p)
1703 *p = 0; 1701 *p = 0;
1704 1702
1705 return len; 1703 return strlen(mce_helper) + !!p;
1706} 1704}
1707 1705
1708static ssize_t set_ignore_ce(struct sys_device *s, 1706static ssize_t set_ignore_ce(struct sys_device *s,
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d4cf4ce19aac..a7aa8f900954 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -66,6 +66,52 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
66}; 66};
67 67
68/* 68/*
69 * Not sure about some of these
70 */
71static const u64 p6_perfmon_event_map[] =
72{
73 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
74 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
75 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0000,
76 [PERF_COUNT_HW_CACHE_MISSES] = 0x0000,
77 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
78 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
79 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
80};
81
82static u64 p6_pmu_event_map(int event)
83{
84 return p6_perfmon_event_map[event];
85}
86
87/*
88 * Counter setting that is specified not to count anything.
89 * We use this to effectively disable a counter.
90 *
91 * L2_RQSTS with 0 MESI unit mask.
92 */
93#define P6_NOP_COUNTER 0x0000002EULL
94
95static u64 p6_pmu_raw_event(u64 event)
96{
97#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
98#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
99#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
100#define P6_EVNTSEL_INV_MASK 0x00800000ULL
101#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL
102
103#define P6_EVNTSEL_MASK \
104 (P6_EVNTSEL_EVENT_MASK | \
105 P6_EVNTSEL_UNIT_MASK | \
106 P6_EVNTSEL_EDGE_MASK | \
107 P6_EVNTSEL_INV_MASK | \
108 P6_EVNTSEL_COUNTER_MASK)
109
110 return event & P6_EVNTSEL_MASK;
111}
112
113
114/*
69 * Intel PerfMon v3. Used on Core2 and later. 115 * Intel PerfMon v3. Used on Core2 and later.
70 */ 116 */
71static const u64 intel_perfmon_event_map[] = 117static const u64 intel_perfmon_event_map[] =
@@ -666,6 +712,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
666{ 712{
667 struct perf_counter_attr *attr = &counter->attr; 713 struct perf_counter_attr *attr = &counter->attr;
668 struct hw_perf_counter *hwc = &counter->hw; 714 struct hw_perf_counter *hwc = &counter->hw;
715 u64 config;
669 int err; 716 int err;
670 717
671 if (!x86_pmu_initialized()) 718 if (!x86_pmu_initialized())
@@ -718,14 +765,40 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
718 765
719 if (attr->config >= x86_pmu.max_events) 766 if (attr->config >= x86_pmu.max_events)
720 return -EINVAL; 767 return -EINVAL;
768
721 /* 769 /*
722 * The generic map: 770 * The generic map:
723 */ 771 */
724 hwc->config |= x86_pmu.event_map(attr->config); 772 config = x86_pmu.event_map(attr->config);
773
774 if (config == 0)
775 return -ENOENT;
776
777 if (config == -1LL)
778 return -EINVAL;
779
780 hwc->config |= config;
725 781
726 return 0; 782 return 0;
727} 783}
728 784
785static void p6_pmu_disable_all(void)
786{
787 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
788 u64 val;
789
790 if (!cpuc->enabled)
791 return;
792
793 cpuc->enabled = 0;
794 barrier();
795
796 /* p6 only has one enable register */
797 rdmsrl(MSR_P6_EVNTSEL0, val);
798 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
799 wrmsrl(MSR_P6_EVNTSEL0, val);
800}
801
729static void intel_pmu_disable_all(void) 802static void intel_pmu_disable_all(void)
730{ 803{
731 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 804 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@ -767,6 +840,23 @@ void hw_perf_disable(void)
767 return x86_pmu.disable_all(); 840 return x86_pmu.disable_all();
768} 841}
769 842
843static void p6_pmu_enable_all(void)
844{
845 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
846 unsigned long val;
847
848 if (cpuc->enabled)
849 return;
850
851 cpuc->enabled = 1;
852 barrier();
853
854 /* p6 only has one enable register */
855 rdmsrl(MSR_P6_EVNTSEL0, val);
856 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
857 wrmsrl(MSR_P6_EVNTSEL0, val);
858}
859
770static void intel_pmu_enable_all(void) 860static void intel_pmu_enable_all(void)
771{ 861{
772 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 862 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
@@ -784,13 +874,13 @@ static void amd_pmu_enable_all(void)
784 barrier(); 874 barrier();
785 875
786 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 876 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
877 struct perf_counter *counter = cpuc->counters[idx];
787 u64 val; 878 u64 val;
788 879
789 if (!test_bit(idx, cpuc->active_mask)) 880 if (!test_bit(idx, cpuc->active_mask))
790 continue; 881 continue;
791 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 882
792 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) 883 val = counter->hw.config;
793 continue;
794 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 884 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
795 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 885 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
796 } 886 }
@@ -819,16 +909,13 @@ static inline void intel_pmu_ack_status(u64 ack)
819 909
820static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 910static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
821{ 911{
822 int err; 912 (void)checking_wrmsrl(hwc->config_base + idx,
823 err = checking_wrmsrl(hwc->config_base + idx,
824 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); 913 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
825} 914}
826 915
827static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 916static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
828{ 917{
829 int err; 918 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
830 err = checking_wrmsrl(hwc->config_base + idx,
831 hwc->config);
832} 919}
833 920
834static inline void 921static inline void
@@ -836,13 +923,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
836{ 923{
837 int idx = __idx - X86_PMC_IDX_FIXED; 924 int idx = __idx - X86_PMC_IDX_FIXED;
838 u64 ctrl_val, mask; 925 u64 ctrl_val, mask;
839 int err;
840 926
841 mask = 0xfULL << (idx * 4); 927 mask = 0xfULL << (idx * 4);
842 928
843 rdmsrl(hwc->config_base, ctrl_val); 929 rdmsrl(hwc->config_base, ctrl_val);
844 ctrl_val &= ~mask; 930 ctrl_val &= ~mask;
845 err = checking_wrmsrl(hwc->config_base, ctrl_val); 931 (void)checking_wrmsrl(hwc->config_base, ctrl_val);
932}
933
934static inline void
935p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
936{
937 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
938 u64 val = P6_NOP_COUNTER;
939
940 if (cpuc->enabled)
941 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
942
943 (void)checking_wrmsrl(hwc->config_base + idx, val);
846} 944}
847 945
848static inline void 946static inline void
@@ -943,6 +1041,19 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
943 err = checking_wrmsrl(hwc->config_base, ctrl_val); 1041 err = checking_wrmsrl(hwc->config_base, ctrl_val);
944} 1042}
945 1043
1044static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1045{
1046 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1047 u64 val;
1048
1049 val = hwc->config;
1050 if (cpuc->enabled)
1051 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1052
1053 (void)checking_wrmsrl(hwc->config_base + idx, val);
1054}
1055
1056
946static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1057static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
947{ 1058{
948 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1059 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -959,8 +1070,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
959 1070
960 if (cpuc->enabled) 1071 if (cpuc->enabled)
961 x86_pmu_enable_counter(hwc, idx); 1072 x86_pmu_enable_counter(hwc, idx);
962 else
963 x86_pmu_disable_counter(hwc, idx);
964} 1073}
965 1074
966static int 1075static int
@@ -1176,6 +1285,49 @@ static void intel_pmu_reset(void)
1176 local_irq_restore(flags); 1285 local_irq_restore(flags);
1177} 1286}
1178 1287
1288static int p6_pmu_handle_irq(struct pt_regs *regs)
1289{
1290 struct perf_sample_data data;
1291 struct cpu_hw_counters *cpuc;
1292 struct perf_counter *counter;
1293 struct hw_perf_counter *hwc;
1294 int idx, handled = 0;
1295 u64 val;
1296
1297 data.regs = regs;
1298 data.addr = 0;
1299
1300 cpuc = &__get_cpu_var(cpu_hw_counters);
1301
1302 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1303 if (!test_bit(idx, cpuc->active_mask))
1304 continue;
1305
1306 counter = cpuc->counters[idx];
1307 hwc = &counter->hw;
1308
1309 val = x86_perf_counter_update(counter, hwc, idx);
1310 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
1311 continue;
1312
1313 /*
1314 * counter overflow
1315 */
1316 handled = 1;
1317 data.period = counter->hw.last_period;
1318
1319 if (!x86_perf_counter_set_period(counter, hwc, idx))
1320 continue;
1321
1322 if (perf_counter_overflow(counter, 1, &data))
1323 p6_pmu_disable_counter(hwc, idx);
1324 }
1325
1326 if (handled)
1327 inc_irq_stat(apic_perf_irqs);
1328
1329 return handled;
1330}
1179 1331
1180/* 1332/*
1181 * This handler is triggered by the local APIC, so the APIC IRQ handling 1333 * This handler is triggered by the local APIC, so the APIC IRQ handling
@@ -1185,14 +1337,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1185{ 1337{
1186 struct perf_sample_data data; 1338 struct perf_sample_data data;
1187 struct cpu_hw_counters *cpuc; 1339 struct cpu_hw_counters *cpuc;
1188 int bit, cpu, loops; 1340 int bit, loops;
1189 u64 ack, status; 1341 u64 ack, status;
1190 1342
1191 data.regs = regs; 1343 data.regs = regs;
1192 data.addr = 0; 1344 data.addr = 0;
1193 1345
1194 cpu = smp_processor_id(); 1346 cpuc = &__get_cpu_var(cpu_hw_counters);
1195 cpuc = &per_cpu(cpu_hw_counters, cpu);
1196 1347
1197 perf_disable(); 1348 perf_disable();
1198 status = intel_pmu_get_status(); 1349 status = intel_pmu_get_status();
@@ -1249,14 +1400,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1249 struct cpu_hw_counters *cpuc; 1400 struct cpu_hw_counters *cpuc;
1250 struct perf_counter *counter; 1401 struct perf_counter *counter;
1251 struct hw_perf_counter *hwc; 1402 struct hw_perf_counter *hwc;
1252 int cpu, idx, handled = 0; 1403 int idx, handled = 0;
1253 u64 val; 1404 u64 val;
1254 1405
1255 data.regs = regs; 1406 data.regs = regs;
1256 data.addr = 0; 1407 data.addr = 0;
1257 1408
1258 cpu = smp_processor_id(); 1409 cpuc = &__get_cpu_var(cpu_hw_counters);
1259 cpuc = &per_cpu(cpu_hw_counters, cpu);
1260 1410
1261 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1411 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1262 if (!test_bit(idx, cpuc->active_mask)) 1412 if (!test_bit(idx, cpuc->active_mask))
@@ -1353,6 +1503,32 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
1353 .priority = 1 1503 .priority = 1
1354}; 1504};
1355 1505
1506static struct x86_pmu p6_pmu = {
1507 .name = "p6",
1508 .handle_irq = p6_pmu_handle_irq,
1509 .disable_all = p6_pmu_disable_all,
1510 .enable_all = p6_pmu_enable_all,
1511 .enable = p6_pmu_enable_counter,
1512 .disable = p6_pmu_disable_counter,
1513 .eventsel = MSR_P6_EVNTSEL0,
1514 .perfctr = MSR_P6_PERFCTR0,
1515 .event_map = p6_pmu_event_map,
1516 .raw_event = p6_pmu_raw_event,
1517 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1518 .max_period = (1ULL << 31) - 1,
1519 .version = 0,
1520 .num_counters = 2,
1521 /*
1522 * Counters have 40 bits implemented. However they are designed such
1523 * that bits [32-39] are sign extensions of bit 31. As such the
1524 * effective width of a counter for P6-like PMU is 32 bits only.
1525 *
1526 * See IA-32 Intel Architecture Software developer manual Vol 3B
1527 */
1528 .counter_bits = 32,
1529 .counter_mask = (1ULL << 32) - 1,
1530};
1531
1356static struct x86_pmu intel_pmu = { 1532static struct x86_pmu intel_pmu = {
1357 .name = "Intel", 1533 .name = "Intel",
1358 .handle_irq = intel_pmu_handle_irq, 1534 .handle_irq = intel_pmu_handle_irq,
@@ -1392,6 +1568,37 @@ static struct x86_pmu amd_pmu = {
1392 .max_period = (1ULL << 47) - 1, 1568 .max_period = (1ULL << 47) - 1,
1393}; 1569};
1394 1570
1571static int p6_pmu_init(void)
1572{
1573 switch (boot_cpu_data.x86_model) {
1574 case 1:
1575 case 3: /* Pentium Pro */
1576 case 5:
1577 case 6: /* Pentium II */
1578 case 7:
1579 case 8:
1580 case 11: /* Pentium III */
1581 break;
1582 case 9:
1583 case 13:
1584 /* Pentium M */
1585 break;
1586 default:
1587 pr_cont("unsupported p6 CPU model %d ",
1588 boot_cpu_data.x86_model);
1589 return -ENODEV;
1590 }
1591
1592 if (!cpu_has_apic) {
1593 pr_info("no Local APIC, try rebooting with lapic");
1594 return -ENODEV;
1595 }
1596
1597 x86_pmu = p6_pmu;
1598
1599 return 0;
1600}
1601
1395static int intel_pmu_init(void) 1602static int intel_pmu_init(void)
1396{ 1603{
1397 union cpuid10_edx edx; 1604 union cpuid10_edx edx;
@@ -1400,8 +1607,14 @@ static int intel_pmu_init(void)
1400 unsigned int ebx; 1607 unsigned int ebx;
1401 int version; 1608 int version;
1402 1609
1403 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 1610 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
1611 /* check for P6 processor family */
1612 if (boot_cpu_data.x86 == 6) {
1613 return p6_pmu_init();
1614 } else {
1404 return -ENODEV; 1615 return -ENODEV;
1616 }
1617 }
1405 1618
1406 /* 1619 /*
1407 * Check whether the Architectural PerfMon supports 1620 * Check whether the Architectural PerfMon supports
@@ -1561,6 +1774,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1561 1774
1562static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); 1775static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1563static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); 1776static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1777static DEFINE_PER_CPU(int, in_nmi_frame);
1564 1778
1565 1779
1566static void 1780static void
@@ -1576,7 +1790,9 @@ static void backtrace_warning(void *data, char *msg)
1576 1790
1577static int backtrace_stack(void *data, char *name) 1791static int backtrace_stack(void *data, char *name)
1578{ 1792{
1579 /* Process all stacks: */ 1793 per_cpu(in_nmi_frame, smp_processor_id()) =
1794 x86_is_stack_id(NMI_STACK, name);
1795
1580 return 0; 1796 return 0;
1581} 1797}
1582 1798
@@ -1584,6 +1800,9 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1584{ 1800{
1585 struct perf_callchain_entry *entry = data; 1801 struct perf_callchain_entry *entry = data;
1586 1802
1803 if (per_cpu(in_nmi_frame, smp_processor_id()))
1804 return;
1805
1587 if (reliable) 1806 if (reliable)
1588 callchain_store(entry, addr); 1807 callchain_store(entry, addr);
1589} 1808}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 5c481f6205bf..e60ed740d2b3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -803,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz)
803 wd_ops->rearm(wd, nmi_hz); 803 wd_ops->rearm(wd, nmi_hz);
804 return 1; 804 return 1;
805} 805}
806
807int lapic_watchdog_ok(void)
808{
809 return wd_ops != NULL;
810}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index d593cd1f58dc..bca5fba91c9e 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -19,6 +19,12 @@
19 19
20#include "dumpstack.h" 20#include "dumpstack.h"
21 21
22/* Just a stub for now */
23int x86_is_stack_id(int id, char *name)
24{
25 return 0;
26}
27
22void dump_trace(struct task_struct *task, struct pt_regs *regs, 28void dump_trace(struct task_struct *task, struct pt_regs *regs,
23 unsigned long *stack, unsigned long bp, 29 unsigned long *stack, unsigned long bp,
24 const struct stacktrace_ops *ops, void *data) 30 const struct stacktrace_ops *ops, void *data)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index d35db5993fd6..54b0a3276766 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -19,10 +19,8 @@
19 19
20#include "dumpstack.h" 20#include "dumpstack.h"
21 21
22static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 22
23 unsigned *usedp, char **idp) 23static char x86_stack_ids[][8] = {
24{
25 static char ids[][8] = {
26 [DEBUG_STACK - 1] = "#DB", 24 [DEBUG_STACK - 1] = "#DB",
27 [NMI_STACK - 1] = "NMI", 25 [NMI_STACK - 1] = "NMI",
28 [DOUBLEFAULT_STACK - 1] = "#DF", 26 [DOUBLEFAULT_STACK - 1] = "#DF",
@@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
33 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 31 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
34#endif 32#endif
35 }; 33 };
34
35int x86_is_stack_id(int id, char *name)
36{
37 return x86_stack_ids[id - 1] == name;
38}
39
40static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
41 unsigned *usedp, char **idp)
42{
36 unsigned k; 43 unsigned k;
37 44
38 /* 45 /*
@@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
61 if (*usedp & (1U << k)) 68 if (*usedp & (1U << k))
62 break; 69 break;
63 *usedp |= 1U << k; 70 *usedp |= 1U << k;
64 *idp = ids[k]; 71 *idp = x86_stack_ids[k];
65 return (unsigned long *)end; 72 return (unsigned long *)end;
66 } 73 }
67 /* 74 /*
@@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
81 do { 88 do {
82 ++j; 89 ++j;
83 end -= EXCEPTION_STKSZ; 90 end -= EXCEPTION_STKSZ;
84 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); 91 x86_stack_ids[j][4] = '1' +
92 (j - N_EXCEPTION_STACKS);
85 } while (stack < end - EXCEPTION_STKSZ); 93 } while (stack < end - EXCEPTION_STKSZ);
86 if (*usedp & (1U << j)) 94 if (*usedp & (1U << j))
87 break; 95 break;
88 *usedp |= 1U << j; 96 *usedp |= 1U << j;
89 *idp = ids[j]; 97 *idp = x86_stack_ids[j];
90 return (unsigned long *)end; 98 return (unsigned long *)end;
91 } 99 }
92#endif 100#endif
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c4ca89d9aaf4..5cb5725b2bae 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -627,10 +627,9 @@ __init void e820_setup_gap(void)
627#ifdef CONFIG_X86_64 627#ifdef CONFIG_X86_64
628 if (!found) { 628 if (!found) {
629 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; 629 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
630 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " 630 printk(KERN_ERR
631 "address range\n" 631 "PCI: Warning: Cannot find a gap in the 32bit address range\n"
632 KERN_ERR "PCI: Unassigned devices with 32bit resource " 632 "PCI: Unassigned devices with 32bit resource registers may break!\n");
633 "registers may break!\n");
634 } 633 }
635#endif 634#endif
636 635
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 96f7ac0bbf01..19ccf6d0dccf 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -512,7 +512,7 @@ void __init efi_enter_virtual_mode(void)
512 && end_pfn <= max_pfn_mapped)) 512 && end_pfn <= max_pfn_mapped))
513 va = __va(md->phys_addr); 513 va = __va(md->phys_addr);
514 else 514 else
515 va = efi_ioremap(md->phys_addr, size); 515 va = efi_ioremap(md->phys_addr, size, md->type);
516 516
517 md->virt_addr = (u64) (unsigned long) va; 517 md->virt_addr = (u64) (unsigned long) va;
518 518
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 22c3b7828c50..ac0621a7ac3d 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void)
98 early_runtime_code_mapping_set_exec(0); 98 early_runtime_code_mapping_set_exec(0);
99} 99}
100 100
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) 101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
102 u32 type)
102{ 103{
103 unsigned long last_map_pfn; 104 unsigned long last_map_pfn;
104 105
106 if (type == EFI_MEMORY_MAPPED_IO)
107 return ioremap(phys_addr, size);
108
105 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); 109 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
106 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) 110 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
107 return NULL; 111 return NULL;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 8663afb56535..0d98a01cbdb2 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -602,7 +602,11 @@ ignore_int:
602#endif 602#endif
603 iret 603 iret
604 604
605.section .cpuinit.data,"wa" 605#ifndef CONFIG_HOTPLUG_CPU
606 __CPUINITDATA
607#else
608 __REFDATA
609#endif
606.align 4 610.align 4
607ENTRY(initial_code) 611ENTRY(initial_code)
608 .long i386_start_kernel 612 .long i386_start_kernel
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 696f0e475c2d..92b7703d3d58 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -187,7 +187,7 @@ static void __init apic_intr_init(void)
187#ifdef CONFIG_X86_THERMAL_VECTOR 187#ifdef CONFIG_X86_THERMAL_VECTOR
188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
189#endif 189#endif
190#ifdef CONFIG_X86_THRESHOLD 190#ifdef CONFIG_X86_MCE_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif 192#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) 193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a78ecad0c900..c664d515f613 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -200,7 +200,7 @@ static void kvm_leave_lazy_mmu(void)
200 state->mode = paravirt_get_lazy_mode(); 200 state->mode = paravirt_get_lazy_mode();
201} 201}
202 202
203static void paravirt_ops_setup(void) 203static void __init paravirt_ops_setup(void)
204{ 204{
205 pv_info.name = "KVM"; 205 pv_info.name = "KVM";
206 pv_info.paravirt_enabled = 1; 206 pv_info.paravirt_enabled = 1;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 846510b78a09..2a62d843f015 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id)
347 347
348static struct irqaction mfgptirq = { 348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick, 349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING, 350 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
351 .name = "mfgpt-timer" 351 .name = "mfgpt-timer"
352}; 352};
353 353
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index cfd9f9063896..d2e56b8f48e7 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -675,7 +675,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
675 nommu: 675 nommu:
676 /* Should not happen anymore */ 676 /* Should not happen anymore */
677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
678 KERN_WARNING "falling back to iommu=soft.\n"); 678 "falling back to iommu=soft.\n");
679 return -1; 679 return -1;
680} 680}
681 681
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 4f9c55f3a7c0..03801f2f761f 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
60 "adc %5,%%edx ; " 60 "adc %5,%%edx ; "
61 : "=A" (product), "=r" (tmp1), "=r" (tmp2) 61 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); 62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
63#elif __x86_64__ 63#elif defined(__x86_64__)
64 __asm__ ( 64 __asm__ (
65 "mul %%rdx ; shrd $32,%%rdx,%%rax" 65 "mul %%rdx ; shrd $32,%%rdx,%%rax"
66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); 66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d2d1ce8170f0..834c9da8bf9d 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/efi.h> 5#include <linux/efi.h>
6#include <linux/dmi.h>
6#include <acpi/reboot.h> 7#include <acpi/reboot.h>
7#include <asm/io.h> 8#include <asm/io.h>
8#include <asm/apic.h> 9#include <asm/apic.h>
@@ -17,7 +18,6 @@
17#include <asm/cpu.h> 18#include <asm/cpu.h>
18 19
19#ifdef CONFIG_X86_32 20#ifdef CONFIG_X86_32
20# include <linux/dmi.h>
21# include <linux/ctype.h> 21# include <linux/ctype.h>
22# include <linux/mc146818rtc.h> 22# include <linux/mc146818rtc.h>
23#else 23#else
@@ -249,6 +249,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), 249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
250 }, 250 },
251 }, 251 },
252 { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */
253 .callback = set_bios_reboot,
254 .ident = "CompuLab SBC-FITPC2",
255 .matches = {
256 DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
257 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
258 },
259 },
252 { } 260 { }
253}; 261};
254 262
@@ -396,6 +404,38 @@ EXPORT_SYMBOL(machine_real_restart);
396 404
397#endif /* CONFIG_X86_32 */ 405#endif /* CONFIG_X86_32 */
398 406
407/*
408 * Apple MacBook5,2 (2009 MacBook) needs reboot=p
409 */
410static int __init set_pci_reboot(const struct dmi_system_id *d)
411{
412 if (reboot_type != BOOT_CF9) {
413 reboot_type = BOOT_CF9;
414 printk(KERN_INFO "%s series board detected. "
415 "Selecting PCI-method for reboots.\n", d->ident);
416 }
417 return 0;
418}
419
420static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
421 { /* Handle problems with rebooting on Apple MacBook5,2 */
422 .callback = set_pci_reboot,
423 .ident = "Apple MacBook",
424 .matches = {
425 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
426 DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"),
427 },
428 },
429 { }
430};
431
432static int __init pci_reboot_init(void)
433{
434 dmi_check_system(pci_reboot_dmi_table);
435 return 0;
436}
437core_initcall(pci_reboot_init);
438
399static inline void kb_wait(void) 439static inline void kb_wait(void)
400{ 440{
401 int i; 441 int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index de2cab132844..63f32d220ef2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -672,6 +672,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
672 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), 672 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
673 }, 673 },
674 }, 674 },
675 {
676 /*
677 * AMI BIOS with low memory corruption was found on Intel DG45ID board.
678 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
679 * match only DMI_BOARD_NAME and see if there is more bad products
680 * with this vendor.
681 */
682 .callback = dmi_low_memory_corruption,
683 .ident = "AMI BIOS",
684 .matches = {
685 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
686 },
687 },
675#endif 688#endif
676 {} 689 {}
677}; 690};
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 367e87882041..78d185d797de 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -112,11 +112,6 @@ SECTIONS
112 _sdata = .; 112 _sdata = .;
113 DATA_DATA 113 DATA_DATA
114 CONSTRUCTORS 114 CONSTRUCTORS
115
116#ifdef CONFIG_X86_64
117 /* End of data section */
118 _edata = .;
119#endif
120 } :data 115 } :data
121 116
122#ifdef CONFIG_X86_32 117#ifdef CONFIG_X86_32
@@ -156,10 +151,8 @@ SECTIONS
156 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { 151 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
157 *(.data.read_mostly) 152 *(.data.read_mostly)
158 153
159#ifdef CONFIG_X86_32
160 /* End of data section */ 154 /* End of data section */
161 _edata = .; 155 _edata = .;
162#endif
163 } 156 }
164 157
165#ifdef CONFIG_X86_64 158#ifdef CONFIG_X86_64
@@ -400,8 +393,8 @@ SECTIONS
400 393
401 394
402#ifdef CONFIG_X86_32 395#ifdef CONFIG_X86_32
403ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), 396. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
404 "kernel image bigger than KERNEL_IMAGE_SIZE") 397 "kernel image bigger than KERNEL_IMAGE_SIZE");
405#else 398#else
406/* 399/*
407 * Per-cpu symbols which need to be offset from __per_cpu_load 400 * Per-cpu symbols which need to be offset from __per_cpu_load
@@ -414,12 +407,12 @@ INIT_PER_CPU(irq_stack_union);
414/* 407/*
415 * Build-time check on the image size: 408 * Build-time check on the image size:
416 */ 409 */
417ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), 410. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
418 "kernel image bigger than KERNEL_IMAGE_SIZE") 411 "kernel image bigger than KERNEL_IMAGE_SIZE");
419 412
420#ifdef CONFIG_SMP 413#ifdef CONFIG_SMP
421ASSERT((per_cpu__irq_stack_union == 0), 414. = ASSERT((per_cpu__irq_stack_union == 0),
422 "irq_stack_union is not at start of per-cpu area"); 415 "irq_stack_union is not at start of per-cpu area");
423#endif 416#endif
424 417
425#endif /* CONFIG_X86_32 */ 418#endif /* CONFIG_X86_32 */
@@ -427,7 +420,7 @@ ASSERT((per_cpu__irq_stack_union == 0),
427#ifdef CONFIG_KEXEC 420#ifdef CONFIG_KEXEC
428#include <asm/kexec.h> 421#include <asm/kexec.h>
429 422
430ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, 423. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
431 "kexec control code size is too big") 424 "kexec control code size is too big");
432#endif 425#endif
433 426
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7bc65f0f62c4..d677fa9ca650 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -22,7 +22,8 @@
22 * 22 *
23 * So how does the kernel know it's a Guest? We'll see that later, but let's 23 * So how does the kernel know it's a Guest? We'll see that later, but let's
24 * just say that we end up here where we replace the native functions various 24 * just say that we end up here where we replace the native functions various
25 * "paravirt" structures with our Guest versions, then boot like normal. :*/ 25 * "paravirt" structures with our Guest versions, then boot like normal.
26:*/
26 27
27/* 28/*
28 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 29 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
@@ -74,7 +75,8 @@
74 * 75 *
75 * The Guest in our tale is a simple creature: identical to the Host but 76 * The Guest in our tale is a simple creature: identical to the Host but
76 * behaving in simplified but equivalent ways. In particular, the Guest is the 77 * behaving in simplified but equivalent ways. In particular, the Guest is the
77 * same kernel as the Host (or at least, built from the same source code). :*/ 78 * same kernel as the Host (or at least, built from the same source code).
79:*/
78 80
79struct lguest_data lguest_data = { 81struct lguest_data lguest_data = {
80 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, 82 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
@@ -85,7 +87,8 @@ struct lguest_data lguest_data = {
85 .syscall_vec = SYSCALL_VECTOR, 87 .syscall_vec = SYSCALL_VECTOR,
86}; 88};
87 89
88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 90/*G:037
91 * async_hcall() is pretty simple: I'm quite proud of it really. We have a
89 * ring buffer of stored hypercalls which the Host will run though next time we 92 * ring buffer of stored hypercalls which the Host will run though next time we
90 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall 93 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go, 94 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
@@ -94,7 +97,8 @@ struct lguest_data lguest_data = {
94 * If we come around to a slot which hasn't been finished, then the table is 97 * If we come around to a slot which hasn't been finished, then the table is
95 * full and we just make the hypercall directly. This has the nice side 98 * full and we just make the hypercall directly. This has the nice side
96 * effect of causing the Host to run all the stored calls in the ring buffer 99 * effect of causing the Host to run all the stored calls in the ring buffer
97 * which empties it for next time! */ 100 * which empties it for next time!
101 */
98static void async_hcall(unsigned long call, unsigned long arg1, 102static void async_hcall(unsigned long call, unsigned long arg1,
99 unsigned long arg2, unsigned long arg3, 103 unsigned long arg2, unsigned long arg3,
100 unsigned long arg4) 104 unsigned long arg4)
@@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1,
103 static unsigned int next_call; 107 static unsigned int next_call;
104 unsigned long flags; 108 unsigned long flags;
105 109
106 /* Disable interrupts if not already disabled: we don't want an 110 /*
111 * Disable interrupts if not already disabled: we don't want an
107 * interrupt handler making a hypercall while we're already doing 112 * interrupt handler making a hypercall while we're already doing
108 * one! */ 113 * one!
114 */
109 local_irq_save(flags); 115 local_irq_save(flags);
110 if (lguest_data.hcall_status[next_call] != 0xFF) { 116 if (lguest_data.hcall_status[next_call] != 0xFF) {
111 /* Table full, so do normal hcall which will flush table. */ 117 /* Table full, so do normal hcall which will flush table. */
@@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1,
125 local_irq_restore(flags); 131 local_irq_restore(flags);
126} 132}
127 133
128/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first 134/*G:035
129 * real optimization trick! 135 * Notice the lazy_hcall() above, rather than hcall(). This is our first real
136 * optimization trick!
130 * 137 *
131 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do 138 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
132 * them as a batch when lazy_mode is eventually turned off. Because hypercalls 139 * them as a batch when lazy_mode is eventually turned off. Because hypercalls
@@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1,
136 * lguest_leave_lazy_mode(). 143 * lguest_leave_lazy_mode().
137 * 144 *
138 * So, when we're in lazy mode, we call async_hcall() to store the call for 145 * So, when we're in lazy mode, we call async_hcall() to store the call for
139 * future processing: */ 146 * future processing:
147 */
140static void lazy_hcall1(unsigned long call, 148static void lazy_hcall1(unsigned long call,
141 unsigned long arg1) 149 unsigned long arg1)
142{ 150{
@@ -146,6 +154,7 @@ static void lazy_hcall1(unsigned long call,
146 async_hcall(call, arg1, 0, 0, 0); 154 async_hcall(call, arg1, 0, 0, 0);
147} 155}
148 156
157/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
149static void lazy_hcall2(unsigned long call, 158static void lazy_hcall2(unsigned long call,
150 unsigned long arg1, 159 unsigned long arg1,
151 unsigned long arg2) 160 unsigned long arg2)
@@ -181,8 +190,10 @@ static void lazy_hcall4(unsigned long call,
181} 190}
182#endif 191#endif
183 192
184/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 193/*G:036
185 * issue the do-nothing hypercall to flush any stored calls. */ 194 * When lazy mode is turned off reset the per-cpu lazy mode variable and then
195 * issue the do-nothing hypercall to flush any stored calls.
196:*/
186static void lguest_leave_lazy_mmu_mode(void) 197static void lguest_leave_lazy_mmu_mode(void)
187{ 198{
188 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 199 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
@@ -208,9 +219,11 @@ static void lguest_end_context_switch(struct task_struct *next)
208 * check there before it tries to deliver an interrupt. 219 * check there before it tries to deliver an interrupt.
209 */ 220 */
210 221
211/* save_flags() is expected to return the processor state (ie. "flags"). The 222/*
223 * save_flags() is expected to return the processor state (ie. "flags"). The
212 * flags word contains all kind of stuff, but in practice Linux only cares 224 * flags word contains all kind of stuff, but in practice Linux only cares
213 * about the interrupt flag. Our "save_flags()" just returns that. */ 225 * about the interrupt flag. Our "save_flags()" just returns that.
226 */
214static unsigned long save_fl(void) 227static unsigned long save_fl(void)
215{ 228{
216 return lguest_data.irq_enabled; 229 return lguest_data.irq_enabled;
@@ -222,13 +235,15 @@ static void irq_disable(void)
222 lguest_data.irq_enabled = 0; 235 lguest_data.irq_enabled = 0;
223} 236}
224 237
225/* Let's pause a moment. Remember how I said these are called so often? 238/*
239 * Let's pause a moment. Remember how I said these are called so often?
226 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to 240 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
227 * break some rules. In particular, these functions are assumed to save their 241 * break some rules. In particular, these functions are assumed to save their
228 * own registers if they need to: normal C functions assume they can trash the 242 * own registers if they need to: normal C functions assume they can trash the
229 * eax register. To use normal C functions, we use 243 * eax register. To use normal C functions, we use
230 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the 244 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
231 * C function, then restores it. */ 245 * C function, then restores it.
246 */
232PV_CALLEE_SAVE_REGS_THUNK(save_fl); 247PV_CALLEE_SAVE_REGS_THUNK(save_fl);
233PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 248PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
234/*:*/ 249/*:*/
@@ -237,18 +252,18 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
237extern void lg_irq_enable(void); 252extern void lg_irq_enable(void);
238extern void lg_restore_fl(unsigned long flags); 253extern void lg_restore_fl(unsigned long flags);
239 254
240/*M:003 Note that we don't check for outstanding interrupts when we re-enable 255/*M:003
241 * them (or when we unmask an interrupt). This seems to work for the moment, 256 * We could be more efficient in our checking of outstanding interrupts, rather
242 * since interrupts are rare and we'll just get the interrupt on the next timer 257 * than using a branch. One way would be to put the "irq_enabled" field in a
243 * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way 258 * page by itself, and have the Host write-protect it when an interrupt comes
244 * would be to put the "irq_enabled" field in a page by itself, and have the 259 * in when irqs are disabled. There will then be a page fault as soon as
245 * Host write-protect it when an interrupt comes in when irqs are disabled. 260 * interrupts are re-enabled.
246 * There will then be a page fault as soon as interrupts are re-enabled.
247 * 261 *
248 * A better method is to implement soft interrupt disable generally for x86: 262 * A better method is to implement soft interrupt disable generally for x86:
249 * instead of disabling interrupts, we set a flag. If an interrupt does come 263 * instead of disabling interrupts, we set a flag. If an interrupt does come
250 * in, we then disable them for real. This is uncommon, so we could simply use 264 * in, we then disable them for real. This is uncommon, so we could simply use
251 * a hypercall for interrupt control and not worry about efficiency. :*/ 265 * a hypercall for interrupt control and not worry about efficiency.
266:*/
252 267
253/*G:034 268/*G:034
254 * The Interrupt Descriptor Table (IDT). 269 * The Interrupt Descriptor Table (IDT).
@@ -261,10 +276,12 @@ extern void lg_restore_fl(unsigned long flags);
261static void lguest_write_idt_entry(gate_desc *dt, 276static void lguest_write_idt_entry(gate_desc *dt,
262 int entrynum, const gate_desc *g) 277 int entrynum, const gate_desc *g)
263{ 278{
264 /* The gate_desc structure is 8 bytes long: we hand it to the Host in 279 /*
280 * The gate_desc structure is 8 bytes long: we hand it to the Host in
265 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors 281 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
266 * around like this; typesafety wasn't a big concern in Linux's early 282 * around like this; typesafety wasn't a big concern in Linux's early
267 * years. */ 283 * years.
284 */
268 u32 *desc = (u32 *)g; 285 u32 *desc = (u32 *)g;
269 /* Keep the local copy up to date. */ 286 /* Keep the local copy up to date. */
270 native_write_idt_entry(dt, entrynum, g); 287 native_write_idt_entry(dt, entrynum, g);
@@ -272,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt,
272 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); 289 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
273} 290}
274 291
275/* Changing to a different IDT is very rare: we keep the IDT up-to-date every 292/*
293 * Changing to a different IDT is very rare: we keep the IDT up-to-date every
276 * time it is written, so we can simply loop through all entries and tell the 294 * time it is written, so we can simply loop through all entries and tell the
277 * Host about them. */ 295 * Host about them.
296 */
278static void lguest_load_idt(const struct desc_ptr *desc) 297static void lguest_load_idt(const struct desc_ptr *desc)
279{ 298{
280 unsigned int i; 299 unsigned int i;
@@ -305,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
305 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); 324 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
306} 325}
307 326
308/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, 327/*
328 * For a single GDT entry which changes, we do the lazy thing: alter our GDT,
309 * then tell the Host to reload the entire thing. This operation is so rare 329 * then tell the Host to reload the entire thing. This operation is so rare
310 * that this naive implementation is reasonable. */ 330 * that this naive implementation is reasonable.
331 */
311static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, 332static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
312 const void *desc, int type) 333 const void *desc, int type)
313{ 334{
@@ -317,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
317 dt[entrynum].a, dt[entrynum].b); 338 dt[entrynum].a, dt[entrynum].b);
318} 339}
319 340
320/* OK, I lied. There are three "thread local storage" GDT entries which change 341/*
342 * OK, I lied. There are three "thread local storage" GDT entries which change
321 * on every context switch (these three entries are how glibc implements 343 * on every context switch (these three entries are how glibc implements
322 * __thread variables). So we have a hypercall specifically for this case. */ 344 * __thread variables). So we have a hypercall specifically for this case.
345 */
323static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) 346static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
324{ 347{
325 /* There's one problem which normal hardware doesn't have: the Host 348 /*
349 * There's one problem which normal hardware doesn't have: the Host
326 * can't handle us removing entries we're currently using. So we clear 350 * can't handle us removing entries we're currently using. So we clear
327 * the GS register here: if it's needed it'll be reloaded anyway. */ 351 * the GS register here: if it's needed it'll be reloaded anyway.
352 */
328 lazy_load_gs(0); 353 lazy_load_gs(0);
329 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); 354 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
330} 355}
331 356
332/*G:038 That's enough excitement for now, back to ploughing through each of 357/*G:038
333 * the different pv_ops structures (we're about 1/3 of the way through). 358 * That's enough excitement for now, back to ploughing through each of the
359 * different pv_ops structures (we're about 1/3 of the way through).
334 * 360 *
335 * This is the Local Descriptor Table, another weird Intel thingy. Linux only 361 * This is the Local Descriptor Table, another weird Intel thingy. Linux only
336 * uses this for some strange applications like Wine. We don't do anything 362 * uses this for some strange applications like Wine. We don't do anything
337 * here, so they'll get an informative and friendly Segmentation Fault. */ 363 * here, so they'll get an informative and friendly Segmentation Fault.
364 */
338static void lguest_set_ldt(const void *addr, unsigned entries) 365static void lguest_set_ldt(const void *addr, unsigned entries)
339{ 366{
340} 367}
341 368
342/* This loads a GDT entry into the "Task Register": that entry points to a 369/*
370 * This loads a GDT entry into the "Task Register": that entry points to a
343 * structure called the Task State Segment. Some comments scattered though the 371 * structure called the Task State Segment. Some comments scattered though the
344 * kernel code indicate that this used for task switching in ages past, along 372 * kernel code indicate that this used for task switching in ages past, along
345 * with blood sacrifice and astrology. 373 * with blood sacrifice and astrology.
@@ -347,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries)
347 * Now there's nothing interesting in here that we don't get told elsewhere. 375 * Now there's nothing interesting in here that we don't get told elsewhere.
348 * But the native version uses the "ltr" instruction, which makes the Host 376 * But the native version uses the "ltr" instruction, which makes the Host
349 * complain to the Guest about a Segmentation Fault and it'll oops. So we 377 * complain to the Guest about a Segmentation Fault and it'll oops. So we
350 * override the native version with a do-nothing version. */ 378 * override the native version with a do-nothing version.
379 */
351static void lguest_load_tr_desc(void) 380static void lguest_load_tr_desc(void)
352{ 381{
353} 382}
354 383
355/* The "cpuid" instruction is a way of querying both the CPU identity 384/*
385 * The "cpuid" instruction is a way of querying both the CPU identity
356 * (manufacturer, model, etc) and its features. It was introduced before the 386 * (manufacturer, model, etc) and its features. It was introduced before the
357 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. 387 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
358 * As you might imagine, after a decade and a half this treatment, it is now a 388 * As you might imagine, after a decade and a half this treatment, it is now a
359 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. 389 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
360 * 390 *
361 * This instruction even it has its own Wikipedia entry. The Wikipedia entry 391 * This instruction even it has its own Wikipedia entry. The Wikipedia entry
362 * has been translated into 4 languages. I am not making this up! 392 * has been translated into 5 languages. I am not making this up!
363 * 393 *
364 * We could get funky here and identify ourselves as "GenuineLguest", but 394 * We could get funky here and identify ourselves as "GenuineLguest", but
365 * instead we just use the real "cpuid" instruction. Then I pretty much turned 395 * instead we just use the real "cpuid" instruction. Then I pretty much turned
@@ -371,7 +401,8 @@ static void lguest_load_tr_desc(void)
371 * Replacing the cpuid so we can turn features off is great for the kernel, but 401 * Replacing the cpuid so we can turn features off is great for the kernel, but
372 * anyone (including userspace) can just use the raw "cpuid" instruction and 402 * anyone (including userspace) can just use the raw "cpuid" instruction and
373 * the Host won't even notice since it isn't privileged. So we try not to get 403 * the Host won't even notice since it isn't privileged. So we try not to get
374 * too worked up about it. */ 404 * too worked up about it.
405 */
375static void lguest_cpuid(unsigned int *ax, unsigned int *bx, 406static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
376 unsigned int *cx, unsigned int *dx) 407 unsigned int *cx, unsigned int *dx)
377{ 408{
@@ -379,38 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
379 410
380 native_cpuid(ax, bx, cx, dx); 411 native_cpuid(ax, bx, cx, dx);
381 switch (function) { 412 switch (function) {
382 case 1: /* Basic feature request. */ 413 /*
383 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 414 * CPUID 0 gives the highest legal CPUID number (and the ID string).
415 * We futureproof our code a little by sticking to known CPUID values.
416 */
417 case 0:
418 if (*ax > 5)
419 *ax = 5;
420 break;
421
422 /*
423 * CPUID 1 is a basic feature request.
424 *
425 * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
426 * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
427 */
428 case 1:
384 *cx &= 0x00002201; 429 *cx &= 0x00002201;
385 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
386 *dx &= 0x07808151; 430 *dx &= 0x07808151;
387 /* The Host can do a nice optimization if it knows that the 431 /*
432 * The Host can do a nice optimization if it knows that the
388 * kernel mappings (addresses above 0xC0000000 or whatever 433 * kernel mappings (addresses above 0xC0000000 or whatever
389 * PAGE_OFFSET is set to) haven't changed. But Linux calls 434 * PAGE_OFFSET is set to) haven't changed. But Linux calls
390 * flush_tlb_user() for both user and kernel mappings unless 435 * flush_tlb_user() for both user and kernel mappings unless
391 * the Page Global Enable (PGE) feature bit is set. */ 436 * the Page Global Enable (PGE) feature bit is set.
437 */
392 *dx |= 0x00002000; 438 *dx |= 0x00002000;
393 /* We also lie, and say we're family id 5. 6 or greater 439 /*
440 * We also lie, and say we're family id 5. 6 or greater
394 * leads to a rdmsr in early_init_intel which we can't handle. 441 * leads to a rdmsr in early_init_intel which we can't handle.
395 * Family ID is returned as bits 8-12 in ax. */ 442 * Family ID is returned as bits 8-12 in ax.
443 */
396 *ax &= 0xFFFFF0FF; 444 *ax &= 0xFFFFF0FF;
397 *ax |= 0x00000500; 445 *ax |= 0x00000500;
398 break; 446 break;
447 /*
448 * 0x80000000 returns the highest Extended Function, so we futureproof
449 * like we do above by limiting it to known fields.
450 */
399 case 0x80000000: 451 case 0x80000000:
400 /* Futureproof this a little: if they ask how much extended
401 * processor information there is, limit it to known fields. */
402 if (*ax > 0x80000008) 452 if (*ax > 0x80000008)
403 *ax = 0x80000008; 453 *ax = 0x80000008;
404 break; 454 break;
455
456 /*
457 * PAE systems can mark pages as non-executable. Linux calls this the
458 * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
459 * Virus Protection). We just switch turn if off here, since we don't
460 * support it.
461 */
405 case 0x80000001: 462 case 0x80000001:
406 /* Here we should fix nx cap depending on host. */
407 /* For this version of PAE, we just clear NX bit. */
408 *dx &= ~(1 << 20); 463 *dx &= ~(1 << 20);
409 break; 464 break;
410 } 465 }
411} 466}
412 467
413/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. 468/*
469 * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
414 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother 470 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
415 * it. The Host needs to know when the Guest wants to change them, so we have 471 * it. The Host needs to know when the Guest wants to change them, so we have
416 * a whole series of functions like read_cr0() and write_cr0(). 472 * a whole series of functions like read_cr0() and write_cr0().
@@ -425,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
425 * name like "FPUTRAP bit" be a little less cryptic? 481 * name like "FPUTRAP bit" be a little less cryptic?
426 * 482 *
427 * We store cr0 locally because the Host never changes it. The Guest sometimes 483 * We store cr0 locally because the Host never changes it. The Guest sometimes
428 * wants to read it and we'd prefer not to bother the Host unnecessarily. */ 484 * wants to read it and we'd prefer not to bother the Host unnecessarily.
485 */
429static unsigned long current_cr0; 486static unsigned long current_cr0;
430static void lguest_write_cr0(unsigned long val) 487static void lguest_write_cr0(unsigned long val)
431{ 488{
@@ -438,18 +495,22 @@ static unsigned long lguest_read_cr0(void)
438 return current_cr0; 495 return current_cr0;
439} 496}
440 497
441/* Intel provided a special instruction to clear the TS bit for people too cool 498/*
499 * Intel provided a special instruction to clear the TS bit for people too cool
442 * to use write_cr0() to do it. This "clts" instruction is faster, because all 500 * to use write_cr0() to do it. This "clts" instruction is faster, because all
443 * the vowels have been optimized out. */ 501 * the vowels have been optimized out.
502 */
444static void lguest_clts(void) 503static void lguest_clts(void)
445{ 504{
446 lazy_hcall1(LHCALL_TS, 0); 505 lazy_hcall1(LHCALL_TS, 0);
447 current_cr0 &= ~X86_CR0_TS; 506 current_cr0 &= ~X86_CR0_TS;
448} 507}
449 508
450/* cr2 is the virtual address of the last page fault, which the Guest only ever 509/*
510 * cr2 is the virtual address of the last page fault, which the Guest only ever
451 * reads. The Host kindly writes this into our "struct lguest_data", so we 511 * reads. The Host kindly writes this into our "struct lguest_data", so we
452 * just read it out of there. */ 512 * just read it out of there.
513 */
453static unsigned long lguest_read_cr2(void) 514static unsigned long lguest_read_cr2(void)
454{ 515{
455 return lguest_data.cr2; 516 return lguest_data.cr2;
@@ -458,10 +519,12 @@ static unsigned long lguest_read_cr2(void)
458/* See lguest_set_pte() below. */ 519/* See lguest_set_pte() below. */
459static bool cr3_changed = false; 520static bool cr3_changed = false;
460 521
461/* cr3 is the current toplevel pagetable page: the principle is the same as 522/*
523 * cr3 is the current toplevel pagetable page: the principle is the same as
462 * cr0. Keep a local copy, and tell the Host when it changes. The only 524 * cr0. Keep a local copy, and tell the Host when it changes. The only
463 * difference is that our local copy is in lguest_data because the Host needs 525 * difference is that our local copy is in lguest_data because the Host needs
464 * to set it upon our initial hypercall. */ 526 * to set it upon our initial hypercall.
527 */
465static void lguest_write_cr3(unsigned long cr3) 528static void lguest_write_cr3(unsigned long cr3)
466{ 529{
467 lguest_data.pgdir = cr3; 530 lguest_data.pgdir = cr3;
@@ -506,7 +569,7 @@ static void lguest_write_cr4(unsigned long val)
506 * cr3 ---> +---------+ 569 * cr3 ---> +---------+
507 * | --------->+---------+ 570 * | --------->+---------+
508 * | | | PADDR1 | 571 * | | | PADDR1 |
509 * Top-level | | PADDR2 | 572 * Mid-level | | PADDR2 |
510 * (PMD) page | | | 573 * (PMD) page | | |
511 * | | Lower-level | 574 * | | Lower-level |
512 * | | (PTE) page | 575 * | | (PTE) page |
@@ -526,21 +589,62 @@ static void lguest_write_cr4(unsigned long val)
526 * Index into top Index into second Offset within page 589 * Index into top Index into second Offset within page
527 * page directory page pagetable page 590 * page directory page pagetable page
528 * 591 *
529 * The kernel spends a lot of time changing both the top-level page directory 592 * Now, unfortunately, this isn't the whole story: Intel added Physical Address
530 * and lower-level pagetable pages. The Guest doesn't know physical addresses, 593 * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
531 * so while it maintains these page tables exactly like normal, it also needs 594 * These are held in 64-bit page table entries, so we can now only fit 512
532 * to keep the Host informed whenever it makes a change: the Host will create 595 * entries in a page, and the neat three-level tree breaks down.
533 * the real page tables based on the Guests'. 596 *
597 * The result is a four level page table:
598 *
599 * cr3 --> [ 4 Upper ]
600 * [ Level ]
601 * [ Entries ]
602 * [(PUD Page)]---> +---------+
603 * | --------->+---------+
604 * | | | PADDR1 |
605 * Mid-level | | PADDR2 |
606 * (PMD) page | | |
607 * | | Lower-level |
608 * | | (PTE) page |
609 * | | | |
610 * .... ....
611 *
612 *
613 * And the virtual address is decoded as:
614 *
615 * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
616 * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
617 * Index into Index into mid Index into lower Offset within page
618 * top entries directory page pagetable page
619 *
620 * It's too hard to switch between these two formats at runtime, so Linux only
621 * supports one or the other depending on whether CONFIG_X86_PAE is set. Many
622 * distributions turn it on, and not just for people with silly amounts of
623 * memory: the larger PTE entries allow room for the NX bit, which lets the
624 * kernel disable execution of pages and increase security.
625 *
626 * This was a problem for lguest, which couldn't run on these distributions;
627 * then Matias Zabaljauregui figured it all out and implemented it, and only a
628 * handful of puppies were crushed in the process!
629 *
630 * Back to our point: the kernel spends a lot of time changing both the
631 * top-level page directory and lower-level pagetable pages. The Guest doesn't
632 * know physical addresses, so while it maintains these page tables exactly
633 * like normal, it also needs to keep the Host informed whenever it makes a
634 * change: the Host will create the real page tables based on the Guests'.
534 */ 635 */
535 636
536/* The Guest calls this to set a second-level entry (pte), ie. to map a page 637/*
537 * into a process' address space. We set the entry then tell the Host the 638 * The Guest calls this after it has set a second-level entry (pte), ie. to map
538 * toplevel and address this corresponds to. The Guest uses one pagetable per 639 * a page into a process' address space. Wetell the Host the toplevel and
539 * process, so we need to tell the Host which one we're changing (mm->pgd). */ 640 * address this corresponds to. The Guest uses one pagetable per process, so
641 * we need to tell the Host which one we're changing (mm->pgd).
642 */
540static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 643static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
541 pte_t *ptep) 644 pte_t *ptep)
542{ 645{
543#ifdef CONFIG_X86_PAE 646#ifdef CONFIG_X86_PAE
647 /* PAE needs to hand a 64 bit page table entry, so it uses two args. */
544 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, 648 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
545 ptep->pte_low, ptep->pte_high); 649 ptep->pte_low, ptep->pte_high);
546#else 650#else
@@ -548,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
548#endif 652#endif
549} 653}
550 654
655/* This is the "set and update" combo-meal-deal version. */
551static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 656static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
552 pte_t *ptep, pte_t pteval) 657 pte_t *ptep, pte_t pteval)
553{ 658{
@@ -555,10 +660,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
555 lguest_pte_update(mm, addr, ptep); 660 lguest_pte_update(mm, addr, ptep);
556} 661}
557 662
558/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd 663/*
664 * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
559 * to set a middle-level entry when PAE is activated. 665 * to set a middle-level entry when PAE is activated.
666 *
560 * Again, we set the entry then tell the Host which page we changed, 667 * Again, we set the entry then tell the Host which page we changed,
561 * and the index of the entry we changed. */ 668 * and the index of the entry we changed.
669 */
562#ifdef CONFIG_X86_PAE 670#ifdef CONFIG_X86_PAE
563static void lguest_set_pud(pud_t *pudp, pud_t pudval) 671static void lguest_set_pud(pud_t *pudp, pud_t pudval)
564{ 672{
@@ -577,8 +685,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
577} 685}
578#else 686#else
579 687
580/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not 688/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
581 * activated. */
582static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 689static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
583{ 690{
584 native_set_pmd(pmdp, pmdval); 691 native_set_pmd(pmdp, pmdval);
@@ -587,7 +694,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
587} 694}
588#endif 695#endif
589 696
590/* There are a couple of legacy places where the kernel sets a PTE, but we 697/*
698 * There are a couple of legacy places where the kernel sets a PTE, but we
591 * don't know the top level any more. This is useless for us, since we don't 699 * don't know the top level any more. This is useless for us, since we don't
592 * know which pagetable is changing or what address, so we just tell the Host 700 * know which pagetable is changing or what address, so we just tell the Host
593 * to forget all of them. Fortunately, this is very rare. 701 * to forget all of them. Fortunately, this is very rare.
@@ -595,7 +703,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
595 * ... except in early boot when the kernel sets up the initial pagetables, 703 * ... except in early boot when the kernel sets up the initial pagetables,
596 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell 704 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell
597 * the Host anything changed until we've done the first page table switch, 705 * the Host anything changed until we've done the first page table switch,
598 * which brings boot back to 0.25 seconds. */ 706 * which brings boot back to 0.25 seconds.
707 */
599static void lguest_set_pte(pte_t *ptep, pte_t pteval) 708static void lguest_set_pte(pte_t *ptep, pte_t pteval)
600{ 709{
601 native_set_pte(ptep, pteval); 710 native_set_pte(ptep, pteval);
@@ -604,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
604} 713}
605 714
606#ifdef CONFIG_X86_PAE 715#ifdef CONFIG_X86_PAE
716/*
717 * With 64-bit PTE values, we need to be careful setting them: if we set 32
718 * bits at a time, the hardware could see a weird half-set entry. These
719 * versions ensure we update all 64 bits at once.
720 */
607static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) 721static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
608{ 722{
609 native_set_pte_atomic(ptep, pte); 723 native_set_pte_atomic(ptep, pte);
@@ -611,19 +725,21 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
611 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 725 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
612} 726}
613 727
614void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 728static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
729 pte_t *ptep)
615{ 730{
616 native_pte_clear(mm, addr, ptep); 731 native_pte_clear(mm, addr, ptep);
617 lguest_pte_update(mm, addr, ptep); 732 lguest_pte_update(mm, addr, ptep);
618} 733}
619 734
620void lguest_pmd_clear(pmd_t *pmdp) 735static void lguest_pmd_clear(pmd_t *pmdp)
621{ 736{
622 lguest_set_pmd(pmdp, __pmd(0)); 737 lguest_set_pmd(pmdp, __pmd(0));
623} 738}
624#endif 739#endif
625 740
626/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 741/*
742 * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
627 * native page table operations. On native hardware you can set a new page 743 * native page table operations. On native hardware you can set a new page
628 * table entry whenever you want, but if you want to remove one you have to do 744 * table entry whenever you want, but if you want to remove one you have to do
629 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). 745 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@ -632,24 +748,29 @@ void lguest_pmd_clear(pmd_t *pmdp)
632 * called when a valid entry is written, not when it's removed (ie. marked not 748 * called when a valid entry is written, not when it's removed (ie. marked not
633 * present). Instead, this is where we come when the Guest wants to remove a 749 * present). Instead, this is where we come when the Guest wants to remove a
634 * page table entry: we tell the Host to set that entry to 0 (ie. the present 750 * page table entry: we tell the Host to set that entry to 0 (ie. the present
635 * bit is zero). */ 751 * bit is zero).
752 */
636static void lguest_flush_tlb_single(unsigned long addr) 753static void lguest_flush_tlb_single(unsigned long addr)
637{ 754{
638 /* Simply set it to zero: if it was not, it will fault back in. */ 755 /* Simply set it to zero: if it was not, it will fault back in. */
639 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); 756 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
640} 757}
641 758
642/* This is what happens after the Guest has removed a large number of entries. 759/*
760 * This is what happens after the Guest has removed a large number of entries.
643 * This tells the Host that any of the page table entries for userspace might 761 * This tells the Host that any of the page table entries for userspace might
644 * have changed, ie. virtual addresses below PAGE_OFFSET. */ 762 * have changed, ie. virtual addresses below PAGE_OFFSET.
763 */
645static void lguest_flush_tlb_user(void) 764static void lguest_flush_tlb_user(void)
646{ 765{
647 lazy_hcall1(LHCALL_FLUSH_TLB, 0); 766 lazy_hcall1(LHCALL_FLUSH_TLB, 0);
648} 767}
649 768
650/* This is called when the kernel page tables have changed. That's not very 769/*
770 * This is called when the kernel page tables have changed. That's not very
651 * common (unless the Guest is using highmem, which makes the Guest extremely 771 * common (unless the Guest is using highmem, which makes the Guest extremely
652 * slow), so it's worth separating this from the user flushing above. */ 772 * slow), so it's worth separating this from the user flushing above.
773 */
653static void lguest_flush_tlb_kernel(void) 774static void lguest_flush_tlb_kernel(void)
654{ 775{
655 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 776 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
@@ -686,26 +807,38 @@ static struct irq_chip lguest_irq_controller = {
686 .unmask = enable_lguest_irq, 807 .unmask = enable_lguest_irq,
687}; 808};
688 809
689/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware 810/*
811 * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
690 * interrupt (except 128, which is used for system calls), and then tells the 812 * interrupt (except 128, which is used for system calls), and then tells the
691 * Linux infrastructure that each interrupt is controlled by our level-based 813 * Linux infrastructure that each interrupt is controlled by our level-based
692 * lguest interrupt controller. */ 814 * lguest interrupt controller.
815 */
693static void __init lguest_init_IRQ(void) 816static void __init lguest_init_IRQ(void)
694{ 817{
695 unsigned int i; 818 unsigned int i;
696 819
697 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 820 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
698 /* Some systems map "vectors" to interrupts weirdly. Lguest has 821 /* Some systems map "vectors" to interrupts weirdly. Not us! */
699 * a straightforward 1 to 1 mapping, so force that here. */
700 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 822 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
701 if (i != SYSCALL_VECTOR) 823 if (i != SYSCALL_VECTOR)
702 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 824 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
703 } 825 }
704 /* This call is required to set up for 4k stacks, where we have 826
705 * separate stacks for hard and soft interrupts. */ 827 /*
828 * This call is required to set up for 4k stacks, where we have
829 * separate stacks for hard and soft interrupts.
830 */
706 irq_ctx_init(smp_processor_id()); 831 irq_ctx_init(smp_processor_id());
707} 832}
708 833
834/*
835 * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
836 * rather than set them in lguest_init_IRQ we are called here every time an
837 * lguest device needs an interrupt.
838 *
839 * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
840 * pass that up!
841 */
709void lguest_setup_irq(unsigned int irq) 842void lguest_setup_irq(unsigned int irq)
710{ 843{
711 irq_to_desc_alloc_node(irq, 0); 844 irq_to_desc_alloc_node(irq, 0);
@@ -724,31 +857,39 @@ static unsigned long lguest_get_wallclock(void)
724 return lguest_data.time.tv_sec; 857 return lguest_data.time.tv_sec;
725} 858}
726 859
727/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 860/*
861 * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
728 * what speed it runs at, or 0 if it's unusable as a reliable clock source. 862 * what speed it runs at, or 0 if it's unusable as a reliable clock source.
729 * This matches what we want here: if we return 0 from this function, the x86 863 * This matches what we want here: if we return 0 from this function, the x86
730 * TSC clock will give up and not register itself. */ 864 * TSC clock will give up and not register itself.
865 */
731static unsigned long lguest_tsc_khz(void) 866static unsigned long lguest_tsc_khz(void)
732{ 867{
733 return lguest_data.tsc_khz; 868 return lguest_data.tsc_khz;
734} 869}
735 870
736/* If we can't use the TSC, the kernel falls back to our lower-priority 871/*
737 * "lguest_clock", where we read the time value given to us by the Host. */ 872 * If we can't use the TSC, the kernel falls back to our lower-priority
873 * "lguest_clock", where we read the time value given to us by the Host.
874 */
738static cycle_t lguest_clock_read(struct clocksource *cs) 875static cycle_t lguest_clock_read(struct clocksource *cs)
739{ 876{
740 unsigned long sec, nsec; 877 unsigned long sec, nsec;
741 878
742 /* Since the time is in two parts (seconds and nanoseconds), we risk 879 /*
880 * Since the time is in two parts (seconds and nanoseconds), we risk
743 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, 881 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
744 * and getting 99 and 0. As Linux tends to come apart under the stress 882 * and getting 99 and 0. As Linux tends to come apart under the stress
745 * of time travel, we must be careful: */ 883 * of time travel, we must be careful:
884 */
746 do { 885 do {
747 /* First we read the seconds part. */ 886 /* First we read the seconds part. */
748 sec = lguest_data.time.tv_sec; 887 sec = lguest_data.time.tv_sec;
749 /* This read memory barrier tells the compiler and the CPU that 888 /*
889 * This read memory barrier tells the compiler and the CPU that
750 * this can't be reordered: we have to complete the above 890 * this can't be reordered: we have to complete the above
751 * before going on. */ 891 * before going on.
892 */
752 rmb(); 893 rmb();
753 /* Now we read the nanoseconds part. */ 894 /* Now we read the nanoseconds part. */
754 nsec = lguest_data.time.tv_nsec; 895 nsec = lguest_data.time.tv_nsec;
@@ -772,9 +913,11 @@ static struct clocksource lguest_clock = {
772 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 913 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
773}; 914};
774 915
775/* We also need a "struct clock_event_device": Linux asks us to set it to go 916/*
917 * We also need a "struct clock_event_device": Linux asks us to set it to go
776 * off some time in the future. Actually, James Morris figured all this out, I 918 * off some time in the future. Actually, James Morris figured all this out, I
777 * just applied the patch. */ 919 * just applied the patch.
920 */
778static int lguest_clockevent_set_next_event(unsigned long delta, 921static int lguest_clockevent_set_next_event(unsigned long delta,
779 struct clock_event_device *evt) 922 struct clock_event_device *evt)
780{ 923{
@@ -824,8 +967,10 @@ static struct clock_event_device lguest_clockevent = {
824 .max_delta_ns = LG_CLOCK_MAX_DELTA, 967 .max_delta_ns = LG_CLOCK_MAX_DELTA,
825}; 968};
826 969
827/* This is the Guest timer interrupt handler (hardware interrupt 0). We just 970/*
828 * call the clockevent infrastructure and it does whatever needs doing. */ 971 * This is the Guest timer interrupt handler (hardware interrupt 0). We just
972 * call the clockevent infrastructure and it does whatever needs doing.
973 */
829static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) 974static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
830{ 975{
831 unsigned long flags; 976 unsigned long flags;
@@ -836,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
836 local_irq_restore(flags); 981 local_irq_restore(flags);
837} 982}
838 983
839/* At some point in the boot process, we get asked to set up our timing 984/*
985 * At some point in the boot process, we get asked to set up our timing
840 * infrastructure. The kernel doesn't expect timer interrupts before this, but 986 * infrastructure. The kernel doesn't expect timer interrupts before this, but
841 * we cleverly initialized the "blocked_interrupts" field of "struct 987 * we cleverly initialized the "blocked_interrupts" field of "struct
842 * lguest_data" so that timer interrupts were blocked until now. */ 988 * lguest_data" so that timer interrupts were blocked until now.
989 */
843static void lguest_time_init(void) 990static void lguest_time_init(void)
844{ 991{
845 /* Set up the timer interrupt (0) to go to our simple timer routine */ 992 /* Set up the timer interrupt (0) to go to our simple timer routine */
@@ -863,14 +1010,16 @@ static void lguest_time_init(void)
863 * to work. They're pretty simple. 1010 * to work. They're pretty simple.
864 */ 1011 */
865 1012
866/* The Guest needs to tell the Host what stack it expects traps to use. For 1013/*
1014 * The Guest needs to tell the Host what stack it expects traps to use. For
867 * native hardware, this is part of the Task State Segment mentioned above in 1015 * native hardware, this is part of the Task State Segment mentioned above in
868 * lguest_load_tr_desc(), but to help hypervisors there's this special call. 1016 * lguest_load_tr_desc(), but to help hypervisors there's this special call.
869 * 1017 *
870 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data 1018 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
871 * segment), the privilege level (we're privilege level 1, the Host is 0 and 1019 * segment), the privilege level (we're privilege level 1, the Host is 0 and
872 * will not tolerate us trying to use that), the stack pointer, and the number 1020 * will not tolerate us trying to use that), the stack pointer, and the number
873 * of pages in the stack. */ 1021 * of pages in the stack.
1022 */
874static void lguest_load_sp0(struct tss_struct *tss, 1023static void lguest_load_sp0(struct tss_struct *tss,
875 struct thread_struct *thread) 1024 struct thread_struct *thread)
876{ 1025{
@@ -884,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value)
884 /* FIXME: Implement */ 1033 /* FIXME: Implement */
885} 1034}
886 1035
887/* There are times when the kernel wants to make sure that no memory writes are 1036/*
1037 * There are times when the kernel wants to make sure that no memory writes are
888 * caught in the cache (that they've all reached real hardware devices). This 1038 * caught in the cache (that they've all reached real hardware devices). This
889 * doesn't matter for the Guest which has virtual hardware. 1039 * doesn't matter for the Guest which has virtual hardware.
890 * 1040 *
@@ -898,11 +1048,13 @@ static void lguest_wbinvd(void)
898{ 1048{
899} 1049}
900 1050
901/* If the Guest expects to have an Advanced Programmable Interrupt Controller, 1051/*
1052 * If the Guest expects to have an Advanced Programmable Interrupt Controller,
902 * we play dumb by ignoring writes and returning 0 for reads. So it's no 1053 * we play dumb by ignoring writes and returning 0 for reads. So it's no
903 * longer Programmable nor Controlling anything, and I don't think 8 lines of 1054 * longer Programmable nor Controlling anything, and I don't think 8 lines of
904 * code qualifies for Advanced. It will also never interrupt anything. It 1055 * code qualifies for Advanced. It will also never interrupt anything. It
905 * does, however, allow us to get through the Linux boot code. */ 1056 * does, however, allow us to get through the Linux boot code.
1057 */
906#ifdef CONFIG_X86_LOCAL_APIC 1058#ifdef CONFIG_X86_LOCAL_APIC
907static void lguest_apic_write(u32 reg, u32 v) 1059static void lguest_apic_write(u32 reg, u32 v)
908{ 1060{
@@ -951,11 +1103,13 @@ static void lguest_safe_halt(void)
951 kvm_hypercall0(LHCALL_HALT); 1103 kvm_hypercall0(LHCALL_HALT);
952} 1104}
953 1105
954/* The SHUTDOWN hypercall takes a string to describe what's happening, and 1106/*
1107 * The SHUTDOWN hypercall takes a string to describe what's happening, and
955 * an argument which says whether this to restart (reboot) the Guest or not. 1108 * an argument which says whether this to restart (reboot) the Guest or not.
956 * 1109 *
957 * Note that the Host always prefers that the Guest speak in physical addresses 1110 * Note that the Host always prefers that the Guest speak in physical addresses
958 * rather than virtual addresses, so we use __pa() here. */ 1111 * rather than virtual addresses, so we use __pa() here.
1112 */
959static void lguest_power_off(void) 1113static void lguest_power_off(void)
960{ 1114{
961 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), 1115 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
@@ -986,8 +1140,10 @@ static __init char *lguest_memory_setup(void)
986 * nice to move it back to lguest_init. Patch welcome... */ 1140 * nice to move it back to lguest_init. Patch welcome... */
987 atomic_notifier_chain_register(&panic_notifier_list, &paniced); 1141 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
988 1142
989 /* The Linux bootloader header contains an "e820" memory map: the 1143 /*
990 * Launcher populated the first entry with our memory limit. */ 1144 *The Linux bootloader header contains an "e820" memory map: the
1145 * Launcher populated the first entry with our memory limit.
1146 */
991 e820_add_region(boot_params.e820_map[0].addr, 1147 e820_add_region(boot_params.e820_map[0].addr,
992 boot_params.e820_map[0].size, 1148 boot_params.e820_map[0].size,
993 boot_params.e820_map[0].type); 1149 boot_params.e820_map[0].type);
@@ -996,16 +1152,17 @@ static __init char *lguest_memory_setup(void)
996 return "LGUEST"; 1152 return "LGUEST";
997} 1153}
998 1154
999/* We will eventually use the virtio console device to produce console output, 1155/*
1156 * We will eventually use the virtio console device to produce console output,
1000 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce 1157 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
1001 * console output. */ 1158 * console output.
1159 */
1002static __init int early_put_chars(u32 vtermno, const char *buf, int count) 1160static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1003{ 1161{
1004 char scratch[17]; 1162 char scratch[17];
1005 unsigned int len = count; 1163 unsigned int len = count;
1006 1164
1007 /* We use a nul-terminated string, so we have to make a copy. Icky, 1165 /* We use a nul-terminated string, so we make a copy. Icky, huh? */
1008 * huh? */
1009 if (len > sizeof(scratch) - 1) 1166 if (len > sizeof(scratch) - 1)
1010 len = sizeof(scratch) - 1; 1167 len = sizeof(scratch) - 1;
1011 scratch[len] = '\0'; 1168 scratch[len] = '\0';
@@ -1016,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1016 return len; 1173 return len;
1017} 1174}
1018 1175
1019/* Rebooting also tells the Host we're finished, but the RESTART flag tells the 1176/*
1020 * Launcher to reboot us. */ 1177 * Rebooting also tells the Host we're finished, but the RESTART flag tells the
1178 * Launcher to reboot us.
1179 */
1021static void lguest_restart(char *reason) 1180static void lguest_restart(char *reason)
1022{ 1181{
1023 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); 1182 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
@@ -1044,7 +1203,8 @@ static void lguest_restart(char *reason)
1044 * fit comfortably. 1203 * fit comfortably.
1045 * 1204 *
1046 * First we need assembly templates of each of the patchable Guest operations, 1205 * First we need assembly templates of each of the patchable Guest operations,
1047 * and these are in i386_head.S. */ 1206 * and these are in i386_head.S.
1207 */
1048 1208
1049/*G:060 We construct a table from the assembler templates: */ 1209/*G:060 We construct a table from the assembler templates: */
1050static const struct lguest_insns 1210static const struct lguest_insns
@@ -1055,9 +1215,11 @@ static const struct lguest_insns
1055 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1215 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
1056}; 1216};
1057 1217
1058/* Now our patch routine is fairly simple (based on the native one in 1218/*
1219 * Now our patch routine is fairly simple (based on the native one in
1059 * paravirt.c). If we have a replacement, we copy it in and return how much of 1220 * paravirt.c). If we have a replacement, we copy it in and return how much of
1060 * the available space we used. */ 1221 * the available space we used.
1222 */
1061static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, 1223static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1062 unsigned long addr, unsigned len) 1224 unsigned long addr, unsigned len)
1063{ 1225{
@@ -1069,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1069 1231
1070 insn_len = lguest_insns[type].end - lguest_insns[type].start; 1232 insn_len = lguest_insns[type].end - lguest_insns[type].start;
1071 1233
1072 /* Similarly if we can't fit replacement (shouldn't happen, but let's 1234 /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
1073 * be thorough). */
1074 if (len < insn_len) 1235 if (len < insn_len)
1075 return paravirt_patch_default(type, clobber, ibuf, addr, len); 1236 return paravirt_patch_default(type, clobber, ibuf, addr, len);
1076 1237
@@ -1079,22 +1240,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1079 return insn_len; 1240 return insn_len;
1080} 1241}
1081 1242
1082/*G:030 Once we get to lguest_init(), we know we're a Guest. The various 1243/*G:029
1244 * Once we get to lguest_init(), we know we're a Guest. The various
1083 * pv_ops structures in the kernel provide points for (almost) every routine we 1245 * pv_ops structures in the kernel provide points for (almost) every routine we
1084 * have to override to avoid privileged instructions. */ 1246 * have to override to avoid privileged instructions.
1247 */
1085__init void lguest_init(void) 1248__init void lguest_init(void)
1086{ 1249{
1087 /* We're under lguest, paravirt is enabled, and we're running at 1250 /* We're under lguest. */
1088 * privilege level 1, not 0 as normal. */
1089 pv_info.name = "lguest"; 1251 pv_info.name = "lguest";
1252 /* Paravirt is enabled. */
1090 pv_info.paravirt_enabled = 1; 1253 pv_info.paravirt_enabled = 1;
1254 /* We're running at privilege level 1, not 0 as normal. */
1091 pv_info.kernel_rpl = 1; 1255 pv_info.kernel_rpl = 1;
1256 /* Everyone except Xen runs with this set. */
1092 pv_info.shared_kernel_pmd = 1; 1257 pv_info.shared_kernel_pmd = 1;
1093 1258
1094 /* We set up all the lguest overrides for sensitive operations. These 1259 /*
1095 * are detailed with the operations themselves. */ 1260 * We set up all the lguest overrides for sensitive operations. These
1261 * are detailed with the operations themselves.
1262 */
1096 1263
1097 /* interrupt-related operations */ 1264 /* Interrupt-related operations */
1098 pv_irq_ops.init_IRQ = lguest_init_IRQ; 1265 pv_irq_ops.init_IRQ = lguest_init_IRQ;
1099 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1266 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
1100 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); 1267 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
@@ -1102,11 +1269,11 @@ __init void lguest_init(void)
1102 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); 1269 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
1103 pv_irq_ops.safe_halt = lguest_safe_halt; 1270 pv_irq_ops.safe_halt = lguest_safe_halt;
1104 1271
1105 /* init-time operations */ 1272 /* Setup operations */
1106 pv_init_ops.memory_setup = lguest_memory_setup; 1273 pv_init_ops.memory_setup = lguest_memory_setup;
1107 pv_init_ops.patch = lguest_patch; 1274 pv_init_ops.patch = lguest_patch;
1108 1275
1109 /* Intercepts of various cpu instructions */ 1276 /* Intercepts of various CPU instructions */
1110 pv_cpu_ops.load_gdt = lguest_load_gdt; 1277 pv_cpu_ops.load_gdt = lguest_load_gdt;
1111 pv_cpu_ops.cpuid = lguest_cpuid; 1278 pv_cpu_ops.cpuid = lguest_cpuid;
1112 pv_cpu_ops.load_idt = lguest_load_idt; 1279 pv_cpu_ops.load_idt = lguest_load_idt;
@@ -1127,7 +1294,7 @@ __init void lguest_init(void)
1127 pv_cpu_ops.start_context_switch = paravirt_start_context_switch; 1294 pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
1128 pv_cpu_ops.end_context_switch = lguest_end_context_switch; 1295 pv_cpu_ops.end_context_switch = lguest_end_context_switch;
1129 1296
1130 /* pagetable management */ 1297 /* Pagetable management */
1131 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1298 pv_mmu_ops.write_cr3 = lguest_write_cr3;
1132 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; 1299 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
1133 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; 1300 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
@@ -1149,54 +1316,71 @@ __init void lguest_init(void)
1149 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1316 pv_mmu_ops.pte_update_defer = lguest_pte_update;
1150 1317
1151#ifdef CONFIG_X86_LOCAL_APIC 1318#ifdef CONFIG_X86_LOCAL_APIC
1152 /* apic read/write intercepts */ 1319 /* APIC read/write intercepts */
1153 set_lguest_basic_apic_ops(); 1320 set_lguest_basic_apic_ops();
1154#endif 1321#endif
1155 1322
1156 /* time operations */ 1323 /* Time operations */
1157 pv_time_ops.get_wallclock = lguest_get_wallclock; 1324 pv_time_ops.get_wallclock = lguest_get_wallclock;
1158 pv_time_ops.time_init = lguest_time_init; 1325 pv_time_ops.time_init = lguest_time_init;
1159 pv_time_ops.get_tsc_khz = lguest_tsc_khz; 1326 pv_time_ops.get_tsc_khz = lguest_tsc_khz;
1160 1327
1161 /* Now is a good time to look at the implementations of these functions 1328 /*
1162 * before returning to the rest of lguest_init(). */ 1329 * Now is a good time to look at the implementations of these functions
1330 * before returning to the rest of lguest_init().
1331 */
1163 1332
1164 /*G:070 Now we've seen all the paravirt_ops, we return to 1333 /*G:070
1334 * Now we've seen all the paravirt_ops, we return to
1165 * lguest_init() where the rest of the fairly chaotic boot setup 1335 * lguest_init() where the rest of the fairly chaotic boot setup
1166 * occurs. */ 1336 * occurs.
1337 */
1167 1338
1168 /* The stack protector is a weird thing where gcc places a canary 1339 /*
1340 * The stack protector is a weird thing where gcc places a canary
1169 * value on the stack and then checks it on return. This file is 1341 * value on the stack and then checks it on return. This file is
1170 * compiled with -fno-stack-protector it, so we got this far without 1342 * compiled with -fno-stack-protector it, so we got this far without
1171 * problems. The value of the canary is kept at offset 20 from the 1343 * problems. The value of the canary is kept at offset 20 from the
1172 * %gs register, so we need to set that up before calling C functions 1344 * %gs register, so we need to set that up before calling C functions
1173 * in other files. */ 1345 * in other files.
1346 */
1174 setup_stack_canary_segment(0); 1347 setup_stack_canary_segment(0);
1175 /* We could just call load_stack_canary_segment(), but we might as 1348
1176 * call switch_to_new_gdt() which loads the whole table and sets up 1349 /*
1177 * the per-cpu segment descriptor register %fs as well. */ 1350 * We could just call load_stack_canary_segment(), but we might as well
1351 * call switch_to_new_gdt() which loads the whole table and sets up the
1352 * per-cpu segment descriptor register %fs as well.
1353 */
1178 switch_to_new_gdt(0); 1354 switch_to_new_gdt(0);
1179 1355
1180 /* As described in head_32.S, we map the first 128M of memory. */ 1356 /* We actually boot with all memory mapped, but let's say 128MB. */
1181 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1357 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1182 1358
1183 /* The Host<->Guest Switcher lives at the top of our address space, and 1359 /*
1360 * The Host<->Guest Switcher lives at the top of our address space, and
1184 * the Host told us how big it is when we made LGUEST_INIT hypercall: 1361 * the Host told us how big it is when we made LGUEST_INIT hypercall:
1185 * it put the answer in lguest_data.reserve_mem */ 1362 * it put the answer in lguest_data.reserve_mem
1363 */
1186 reserve_top_address(lguest_data.reserve_mem); 1364 reserve_top_address(lguest_data.reserve_mem);
1187 1365
1188 /* If we don't initialize the lock dependency checker now, it crashes 1366 /*
1189 * paravirt_disable_iospace. */ 1367 * If we don't initialize the lock dependency checker now, it crashes
1368 * paravirt_disable_iospace.
1369 */
1190 lockdep_init(); 1370 lockdep_init();
1191 1371
1192 /* The IDE code spends about 3 seconds probing for disks: if we reserve 1372 /*
1373 * The IDE code spends about 3 seconds probing for disks: if we reserve
1193 * all the I/O ports up front it can't get them and so doesn't probe. 1374 * all the I/O ports up front it can't get them and so doesn't probe.
1194 * Other device drivers are similar (but less severe). This cuts the 1375 * Other device drivers are similar (but less severe). This cuts the
1195 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ 1376 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds.
1377 */
1196 paravirt_disable_iospace(); 1378 paravirt_disable_iospace();
1197 1379
1198 /* This is messy CPU setup stuff which the native boot code does before 1380 /*
1199 * start_kernel, so we have to do, too: */ 1381 * This is messy CPU setup stuff which the native boot code does before
1382 * start_kernel, so we have to do, too:
1383 */
1200 cpu_detect(&new_cpu_data); 1384 cpu_detect(&new_cpu_data);
1201 /* head.S usually sets up the first capability word, so do it here. */ 1385 /* head.S usually sets up the first capability word, so do it here. */
1202 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1386 new_cpu_data.x86_capability[0] = cpuid_edx(1);
@@ -1213,22 +1397,28 @@ __init void lguest_init(void)
1213 acpi_ht = 0; 1397 acpi_ht = 0;
1214#endif 1398#endif
1215 1399
1216 /* We set the preferred console to "hvc". This is the "hypervisor 1400 /*
1401 * We set the preferred console to "hvc". This is the "hypervisor
1217 * virtual console" driver written by the PowerPC people, which we also 1402 * virtual console" driver written by the PowerPC people, which we also
1218 * adapted for lguest's use. */ 1403 * adapted for lguest's use.
1404 */
1219 add_preferred_console("hvc", 0, NULL); 1405 add_preferred_console("hvc", 0, NULL);
1220 1406
1221 /* Register our very early console. */ 1407 /* Register our very early console. */
1222 virtio_cons_early_init(early_put_chars); 1408 virtio_cons_early_init(early_put_chars);
1223 1409
1224 /* Last of all, we set the power management poweroff hook to point to 1410 /*
1411 * Last of all, we set the power management poweroff hook to point to
1225 * the Guest routine to power off, and the reboot hook to our restart 1412 * the Guest routine to power off, and the reboot hook to our restart
1226 * routine. */ 1413 * routine.
1414 */
1227 pm_power_off = lguest_power_off; 1415 pm_power_off = lguest_power_off;
1228 machine_ops.restart = lguest_restart; 1416 machine_ops.restart = lguest_restart;
1229 1417
1230 /* Now we're set up, call i386_start_kernel() in head32.c and we proceed 1418 /*
1231 * to boot as normal. It never returns. */ 1419 * Now we're set up, call i386_start_kernel() in head32.c and we proceed
1420 * to boot as normal. It never returns.
1421 */
1232 i386_start_kernel(); 1422 i386_start_kernel();
1233} 1423}
1234/* 1424/*
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index a9c8cfe61cd4..27eac0faee48 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -5,7 +5,8 @@
5#include <asm/thread_info.h> 5#include <asm/thread_info.h>
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7 7
8/*G:020 Our story starts with the kernel booting into startup_32 in 8/*G:020
9 * Our story starts with the kernel booting into startup_32 in
9 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by 10 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
10 * the bootloader (the Launcher in our case). 11 * the bootloader (the Launcher in our case).
11 * 12 *
@@ -21,11 +22,14 @@
21 * data without remembering to subtract __PAGE_OFFSET! 22 * data without remembering to subtract __PAGE_OFFSET!
22 * 23 *
23 * The .section line puts this code in .init.text so it will be discarded after 24 * The .section line puts this code in .init.text so it will be discarded after
24 * boot. */ 25 * boot.
26 */
25.section .init.text, "ax", @progbits 27.section .init.text, "ax", @progbits
26ENTRY(lguest_entry) 28ENTRY(lguest_entry)
27 /* We make the "initialization" hypercall now to tell the Host about 29 /*
28 * us, and also find out where it put our page tables. */ 30 * We make the "initialization" hypercall now to tell the Host about
31 * us, and also find out where it put our page tables.
32 */
29 movl $LHCALL_LGUEST_INIT, %eax 33 movl $LHCALL_LGUEST_INIT, %eax
30 movl $lguest_data - __PAGE_OFFSET, %ebx 34 movl $lguest_data - __PAGE_OFFSET, %ebx
31 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 35 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
@@ -33,13 +37,14 @@ ENTRY(lguest_entry)
33 /* Set up the initial stack so we can run C code. */ 37 /* Set up the initial stack so we can run C code. */
34 movl $(init_thread_union+THREAD_SIZE),%esp 38 movl $(init_thread_union+THREAD_SIZE),%esp
35 39
36 /* Jumps are relative, and we're running __PAGE_OFFSET too low at the 40 /* Jumps are relative: we're running __PAGE_OFFSET too low. */
37 * moment. */
38 jmp lguest_init+__PAGE_OFFSET 41 jmp lguest_init+__PAGE_OFFSET
39 42
40/*G:055 We create a macro which puts the assembler code between lgstart_ and 43/*G:055
41 * lgend_ markers. These templates are put in the .text section: they can't be 44 * We create a macro which puts the assembler code between lgstart_ and lgend_
42 * discarded after boot as we may need to patch modules, too. */ 45 * markers. These templates are put in the .text section: they can't be
46 * discarded after boot as we may need to patch modules, too.
47 */
43.text 48.text
44#define LGUEST_PATCH(name, insns...) \ 49#define LGUEST_PATCH(name, insns...) \
45 lgstart_##name: insns; lgend_##name:; \ 50 lgstart_##name: insns; lgend_##name:; \
@@ -48,83 +53,103 @@ ENTRY(lguest_entry)
48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 53LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
49LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 54LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
50 55
51/*G:033 But using those wrappers is inefficient (we'll see why that doesn't 56/*G:033
52 * matter for save_fl and irq_disable later). If we write our routines 57 * But using those wrappers is inefficient (we'll see why that doesn't matter
53 * carefully in assembler, we can avoid clobbering any registers and avoid 58 * for save_fl and irq_disable later). If we write our routines carefully in
54 * jumping through the wrapper functions. 59 * assembler, we can avoid clobbering any registers and avoid jumping through
60 * the wrapper functions.
55 * 61 *
56 * I skipped over our first piece of assembler, but this one is worth studying 62 * I skipped over our first piece of assembler, but this one is worth studying
57 * in a bit more detail so I'll describe in easy stages. First, the routine 63 * in a bit more detail so I'll describe in easy stages. First, the routine to
58 * to enable interrupts: */ 64 * enable interrupts:
65 */
59ENTRY(lg_irq_enable) 66ENTRY(lg_irq_enable)
60 /* The reverse of irq_disable, this sets lguest_data.irq_enabled to 67 /*
61 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ 68 * The reverse of irq_disable, this sets lguest_data.irq_enabled to
69 * X86_EFLAGS_IF (ie. "Interrupts enabled").
70 */
62 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled 71 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
63 /* But now we need to check if the Host wants to know: there might have 72 /*
73 * But now we need to check if the Host wants to know: there might have
64 * been interrupts waiting to be delivered, in which case it will have 74 * been interrupts waiting to be delivered, in which case it will have
65 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we 75 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
66 * jump to send_interrupts, otherwise we're done. */ 76 * jump to send_interrupts, otherwise we're done.
77 */
67 testl $0, lguest_data+LGUEST_DATA_irq_pending 78 testl $0, lguest_data+LGUEST_DATA_irq_pending
68 jnz send_interrupts 79 jnz send_interrupts
69 /* One cool thing about x86 is that you can do many things without using 80 /*
81 * One cool thing about x86 is that you can do many things without using
70 * a register. In this case, the normal path hasn't needed to save or 82 * a register. In this case, the normal path hasn't needed to save or
71 * restore any registers at all! */ 83 * restore any registers at all!
84 */
72 ret 85 ret
73send_interrupts: 86send_interrupts:
74 /* OK, now we need a register: eax is used for the hypercall number, 87 /*
88 * OK, now we need a register: eax is used for the hypercall number,
75 * which is LHCALL_SEND_INTERRUPTS. 89 * which is LHCALL_SEND_INTERRUPTS.
76 * 90 *
77 * We used not to bother with this pending detection at all, which was 91 * We used not to bother with this pending detection at all, which was
78 * much simpler. Sooner or later the Host would realize it had to 92 * much simpler. Sooner or later the Host would realize it had to
79 * send us an interrupt. But that turns out to make performance 7 93 * send us an interrupt. But that turns out to make performance 7
80 * times worse on a simple tcp benchmark. So now we do this the hard 94 * times worse on a simple tcp benchmark. So now we do this the hard
81 * way. */ 95 * way.
96 */
82 pushl %eax 97 pushl %eax
83 movl $LHCALL_SEND_INTERRUPTS, %eax 98 movl $LHCALL_SEND_INTERRUPTS, %eax
84 /* This is a vmcall instruction (same thing that KVM uses). Older 99 /*
100 * This is a vmcall instruction (same thing that KVM uses). Older
85 * assembler versions might not know the "vmcall" instruction, so we 101 * assembler versions might not know the "vmcall" instruction, so we
86 * create one manually here. */ 102 * create one manually here.
103 */
87 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 104 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
105 /* Put eax back the way we found it. */
88 popl %eax 106 popl %eax
89 ret 107 ret
90 108
91/* Finally, the "popf" or "restore flags" routine. The %eax register holds the 109/*
110 * Finally, the "popf" or "restore flags" routine. The %eax register holds the
92 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're 111 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
93 * enabling interrupts again, if it's 0 we're leaving them off. */ 112 * enabling interrupts again, if it's 0 we're leaving them off.
113 */
94ENTRY(lg_restore_fl) 114ENTRY(lg_restore_fl)
95 /* This is just "lguest_data.irq_enabled = flags;" */ 115 /* This is just "lguest_data.irq_enabled = flags;" */
96 movl %eax, lguest_data+LGUEST_DATA_irq_enabled 116 movl %eax, lguest_data+LGUEST_DATA_irq_enabled
97 /* Now, if the %eax value has enabled interrupts and 117 /*
118 * Now, if the %eax value has enabled interrupts and
98 * lguest_data.irq_pending is set, we want to tell the Host so it can 119 * lguest_data.irq_pending is set, we want to tell the Host so it can
99 * deliver any outstanding interrupts. Fortunately, both values will 120 * deliver any outstanding interrupts. Fortunately, both values will
100 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" 121 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
101 * instruction will AND them together for us. If both are set, we 122 * instruction will AND them together for us. If both are set, we
102 * jump to send_interrupts. */ 123 * jump to send_interrupts.
124 */
103 testl lguest_data+LGUEST_DATA_irq_pending, %eax 125 testl lguest_data+LGUEST_DATA_irq_pending, %eax
104 jnz send_interrupts 126 jnz send_interrupts
105 /* Again, the normal path has used no extra registers. Clever, huh? */ 127 /* Again, the normal path has used no extra registers. Clever, huh? */
106 ret 128 ret
129/*:*/
107 130
108/* These demark the EIP range where host should never deliver interrupts. */ 131/* These demark the EIP range where host should never deliver interrupts. */
109.global lguest_noirq_start 132.global lguest_noirq_start
110.global lguest_noirq_end 133.global lguest_noirq_end
111 134
112/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, 135/*M:004
113 * it sets the eflags interrupt bit on the stack based on 136 * When the Host reflects a trap or injects an interrupt into the Guest, it
114 * lguest_data.irq_enabled, so the Guest iret logic does the right thing when 137 * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
115 * restoring it. However, when the Host sets the Guest up for direct traps, 138 * so the Guest iret logic does the right thing when restoring it. However,
116 * such as system calls, the processor is the one to push eflags onto the 139 * when the Host sets the Guest up for direct traps, such as system calls, the
117 * stack, and the interrupt bit will be 1 (in reality, interrupts are always 140 * processor is the one to push eflags onto the stack, and the interrupt bit
118 * enabled in the Guest). 141 * will be 1 (in reality, interrupts are always enabled in the Guest).
119 * 142 *
120 * This turns out to be harmless: the only trap which should happen under Linux 143 * This turns out to be harmless: the only trap which should happen under Linux
121 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc 144 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
122 * regions), which has to be reflected through the Host anyway. If another 145 * regions), which has to be reflected through the Host anyway. If another
123 * trap *does* go off when interrupts are disabled, the Guest will panic, and 146 * trap *does* go off when interrupts are disabled, the Guest will panic, and
124 * we'll never get to this iret! :*/ 147 * we'll never get to this iret!
148:*/
125 149
126/*G:045 There is one final paravirt_op that the Guest implements, and glancing 150/*G:045
127 * at it you can see why I left it to last. It's *cool*! It's in *assembler*! 151 * There is one final paravirt_op that the Guest implements, and glancing at it
152 * you can see why I left it to last. It's *cool*! It's in *assembler*!
128 * 153 *
129 * The "iret" instruction is used to return from an interrupt or trap. The 154 * The "iret" instruction is used to return from an interrupt or trap. The
130 * stack looks like this: 155 * stack looks like this:
@@ -148,15 +173,18 @@ ENTRY(lg_restore_fl)
148 * return to userspace or wherever. Our solution to this is to surround the 173 * return to userspace or wherever. Our solution to this is to surround the
149 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the 174 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
150 * Host that it is *never* to interrupt us there, even if interrupts seem to be 175 * Host that it is *never* to interrupt us there, even if interrupts seem to be
151 * enabled. */ 176 * enabled.
177 */
152ENTRY(lguest_iret) 178ENTRY(lguest_iret)
153 pushl %eax 179 pushl %eax
154 movl 12(%esp), %eax 180 movl 12(%esp), %eax
155lguest_noirq_start: 181lguest_noirq_start:
156 /* Note the %ss: segment prefix here. Normal data accesses use the 182 /*
183 * Note the %ss: segment prefix here. Normal data accesses use the
157 * "ds" segment, but that will have already been restored for whatever 184 * "ds" segment, but that will have already been restored for whatever
158 * we're returning to (such as userspace): we can't trust it. The %ss: 185 * we're returning to (such as userspace): we can't trust it. The %ss:
159 * prefix makes sure we use the stack segment, which is still valid. */ 186 * prefix makes sure we use the stack segment, which is still valid.
187 */
160 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled 188 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
161 popl %eax 189 popl %eax
162 iret 190 iret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f9d35632666b..07c31899c9c2 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -10,6 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o
10lib-y += memcpy_$(BITS).o 10lib-y += memcpy_$(BITS).o
11 11
12ifeq ($(CONFIG_X86_32),y) 12ifeq ($(CONFIG_X86_32),y)
13 obj-y += atomic64_32.o
13 lib-y += checksum_32.o 14 lib-y += checksum_32.o
14 lib-y += strstr_32.o 15 lib-y += strstr_32.o
15 lib-y += semaphore_32.o string_32.o 16 lib-y += semaphore_32.o string_32.o
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
new file mode 100644
index 000000000000..824fa0be55a3
--- /dev/null
+++ b/arch/x86/lib/atomic64_32.c
@@ -0,0 +1,230 @@
1#include <linux/compiler.h>
2#include <linux/module.h>
3#include <linux/types.h>
4
5#include <asm/processor.h>
6#include <asm/cmpxchg.h>
7#include <asm/atomic.h>
8
9static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new)
10{
11 u32 low = new;
12 u32 high = new >> 32;
13
14 asm volatile(
15 LOCK_PREFIX "cmpxchg8b %1\n"
16 : "+A" (old), "+m" (*ptr)
17 : "b" (low), "c" (high)
18 );
19 return old;
20}
21
22u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val)
23{
24 return cmpxchg8b(&ptr->counter, old_val, new_val);
25}
26EXPORT_SYMBOL(atomic64_cmpxchg);
27
28/**
29 * atomic64_xchg - xchg atomic64 variable
30 * @ptr: pointer to type atomic64_t
31 * @new_val: value to assign
32 *
33 * Atomically xchgs the value of @ptr to @new_val and returns
34 * the old value.
35 */
36u64 atomic64_xchg(atomic64_t *ptr, u64 new_val)
37{
38 /*
39 * Try first with a (possibly incorrect) assumption about
40 * what we have there. We'll do two loops most likely,
41 * but we'll get an ownership MESI transaction straight away
42 * instead of a read transaction followed by a
43 * flush-for-ownership transaction:
44 */
45 u64 old_val, real_val = 0;
46
47 do {
48 old_val = real_val;
49
50 real_val = atomic64_cmpxchg(ptr, old_val, new_val);
51
52 } while (real_val != old_val);
53
54 return old_val;
55}
56EXPORT_SYMBOL(atomic64_xchg);
57
58/**
59 * atomic64_set - set atomic64 variable
60 * @ptr: pointer to type atomic64_t
61 * @new_val: value to assign
62 *
63 * Atomically sets the value of @ptr to @new_val.
64 */
65void atomic64_set(atomic64_t *ptr, u64 new_val)
66{
67 atomic64_xchg(ptr, new_val);
68}
69EXPORT_SYMBOL(atomic64_set);
70
71/**
72EXPORT_SYMBOL(atomic64_read);
73 * atomic64_add_return - add and return
74 * @delta: integer value to add
75 * @ptr: pointer to type atomic64_t
76 *
77 * Atomically adds @delta to @ptr and returns @delta + *@ptr
78 */
79noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
80{
81 /*
82 * Try first with a (possibly incorrect) assumption about
83 * what we have there. We'll do two loops most likely,
84 * but we'll get an ownership MESI transaction straight away
85 * instead of a read transaction followed by a
86 * flush-for-ownership transaction:
87 */
88 u64 old_val, new_val, real_val = 0;
89
90 do {
91 old_val = real_val;
92 new_val = old_val + delta;
93
94 real_val = atomic64_cmpxchg(ptr, old_val, new_val);
95
96 } while (real_val != old_val);
97
98 return new_val;
99}
100EXPORT_SYMBOL(atomic64_add_return);
101
102u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
103{
104 return atomic64_add_return(-delta, ptr);
105}
106EXPORT_SYMBOL(atomic64_sub_return);
107
108u64 atomic64_inc_return(atomic64_t *ptr)
109{
110 return atomic64_add_return(1, ptr);
111}
112EXPORT_SYMBOL(atomic64_inc_return);
113
114u64 atomic64_dec_return(atomic64_t *ptr)
115{
116 return atomic64_sub_return(1, ptr);
117}
118EXPORT_SYMBOL(atomic64_dec_return);
119
120/**
121 * atomic64_add - add integer to atomic64 variable
122 * @delta: integer value to add
123 * @ptr: pointer to type atomic64_t
124 *
125 * Atomically adds @delta to @ptr.
126 */
127void atomic64_add(u64 delta, atomic64_t *ptr)
128{
129 atomic64_add_return(delta, ptr);
130}
131EXPORT_SYMBOL(atomic64_add);
132
133/**
134 * atomic64_sub - subtract the atomic64 variable
135 * @delta: integer value to subtract
136 * @ptr: pointer to type atomic64_t
137 *
138 * Atomically subtracts @delta from @ptr.
139 */
140void atomic64_sub(u64 delta, atomic64_t *ptr)
141{
142 atomic64_add(-delta, ptr);
143}
144EXPORT_SYMBOL(atomic64_sub);
145
146/**
147 * atomic64_sub_and_test - subtract value from variable and test result
148 * @delta: integer value to subtract
149 * @ptr: pointer to type atomic64_t
150 *
151 * Atomically subtracts @delta from @ptr and returns
152 * true if the result is zero, or false for all
153 * other cases.
154 */
155int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
156{
157 u64 new_val = atomic64_sub_return(delta, ptr);
158
159 return new_val == 0;
160}
161EXPORT_SYMBOL(atomic64_sub_and_test);
162
163/**
164 * atomic64_inc - increment atomic64 variable
165 * @ptr: pointer to type atomic64_t
166 *
167 * Atomically increments @ptr by 1.
168 */
169void atomic64_inc(atomic64_t *ptr)
170{
171 atomic64_add(1, ptr);
172}
173EXPORT_SYMBOL(atomic64_inc);
174
175/**
176 * atomic64_dec - decrement atomic64 variable
177 * @ptr: pointer to type atomic64_t
178 *
179 * Atomically decrements @ptr by 1.
180 */
181void atomic64_dec(atomic64_t *ptr)
182{
183 atomic64_sub(1, ptr);
184}
185EXPORT_SYMBOL(atomic64_dec);
186
187/**
188 * atomic64_dec_and_test - decrement and test
189 * @ptr: pointer to type atomic64_t
190 *
191 * Atomically decrements @ptr by 1 and
192 * returns true if the result is 0, or false for all other
193 * cases.
194 */
195int atomic64_dec_and_test(atomic64_t *ptr)
196{
197 return atomic64_sub_and_test(1, ptr);
198}
199EXPORT_SYMBOL(atomic64_dec_and_test);
200
201/**
202 * atomic64_inc_and_test - increment and test
203 * @ptr: pointer to type atomic64_t
204 *
205 * Atomically increments @ptr by 1
206 * and returns true if the result is zero, or false for all
207 * other cases.
208 */
209int atomic64_inc_and_test(atomic64_t *ptr)
210{
211 return atomic64_sub_and_test(-1, ptr);
212}
213EXPORT_SYMBOL(atomic64_inc_and_test);
214
215/**
216 * atomic64_add_negative - add and test if negative
217 * @delta: integer value to add
218 * @ptr: pointer to type atomic64_t
219 *
220 * Atomically adds @delta to @ptr and returns true
221 * if the result is negative, or false when
222 * result is greater than or equal to zero.
223 */
224int atomic64_add_negative(u64 delta, atomic64_t *ptr)
225{
226 s64 new_val = atomic64_add_return(delta, ptr);
227
228 return new_val < 0;
229}
230EXPORT_SYMBOL(atomic64_add_negative);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 9a10a78bb4a4..ebeafcce04a9 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -5,15 +5,14 @@
5 * Zero a page. 5 * Zero a page.
6 * rdi page 6 * rdi page
7 */ 7 */
8 ALIGN 8ENTRY(clear_page_c)
9clear_page_c:
10 CFI_STARTPROC 9 CFI_STARTPROC
11 movl $4096/8,%ecx 10 movl $4096/8,%ecx
12 xorl %eax,%eax 11 xorl %eax,%eax
13 rep stosq 12 rep stosq
14 ret 13 ret
15 CFI_ENDPROC 14 CFI_ENDPROC
16ENDPROC(clear_page) 15ENDPROC(clear_page_c)
17 16
18ENTRY(clear_page) 17ENTRY(clear_page)
19 CFI_STARTPROC 18 CFI_STARTPROC
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index f118c110af32..6ba0f7bb85ea 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -75,6 +75,7 @@ ENTRY(copy_to_user)
75 jae bad_to_user 75 jae bad_to_user
76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
77 CFI_ENDPROC 77 CFI_ENDPROC
78ENDPROC(copy_to_user)
78 79
79/* Standard copy_from_user with segment limit checking */ 80/* Standard copy_from_user with segment limit checking */
80ENTRY(copy_from_user) 81ENTRY(copy_from_user)
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 1440b9c0547e..caa24aca8115 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -89,16 +89,13 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
89 rv.msrs = msrs; 89 rv.msrs = msrs;
90 rv.msr_no = msr_no; 90 rv.msr_no = msr_no;
91 91
92 preempt_disable(); 92 this_cpu = get_cpu();
93 /* 93
94 * FIXME: handle the CPU we're executing on separately for now until 94 if (cpumask_test_cpu(this_cpu, mask))
95 * smp_call_function_many has been fixed to not skip it. 95 __rdmsr_on_cpu(&rv);
96 */
97 this_cpu = raw_smp_processor_id();
98 smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
99 96
100 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); 97 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
101 preempt_enable(); 98 put_cpu();
102} 99}
103EXPORT_SYMBOL(rdmsr_on_cpus); 100EXPORT_SYMBOL(rdmsr_on_cpus);
104 101
@@ -121,16 +118,13 @@ void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
121 rv.msrs = msrs; 118 rv.msrs = msrs;
122 rv.msr_no = msr_no; 119 rv.msr_no = msr_no;
123 120
124 preempt_disable(); 121 this_cpu = get_cpu();
125 /* 122
126 * FIXME: handle the CPU we're executing on separately for now until 123 if (cpumask_test_cpu(this_cpu, mask))
127 * smp_call_function_many has been fixed to not skip it. 124 __wrmsr_on_cpu(&rv);
128 */
129 this_cpu = raw_smp_processor_id();
130 smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
131 125
132 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); 126 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
133 preempt_enable(); 127 put_cpu();
134} 128}
135EXPORT_SYMBOL(wrmsr_on_cpus); 129EXPORT_SYMBOL(wrmsr_on_cpus);
136 130
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7c8ca91bb9ec..1f118d462acc 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -751,7 +751,7 @@ survive:
751 751
752 if (retval == -ENOMEM && is_global_init(current)) { 752 if (retval == -ENOMEM && is_global_init(current)) {
753 up_read(&current->mm->mmap_sem); 753 up_read(&current->mm->mmap_sem);
754 congestion_wait(WRITE, HZ/50); 754 congestion_wait(BLK_RW_ASYNC, HZ/50);
755 goto survive; 755 goto survive;
756 } 756 }
757 757
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 78a5fff857be..bfae139182ff 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -426,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address)
426} 426}
427 427
428static const char errata93_warning[] = 428static const char errata93_warning[] =
429KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" 429KERN_ERR
430KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" 430"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
431KERN_ERR "******* Please consider a BIOS update.\n" 431"******* Working around it, but it may cause SEGVs or burn power.\n"
432KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; 432"******* Please consider a BIOS update.\n"
433"******* Disabling USB legacy in the BIOS may also help.\n";
433 434
434/* 435/*
435 * No vm86 mode in 64-bit mode: 436 * No vm86 mode in 64-bit mode:
@@ -696,7 +697,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
696 if (!printk_ratelimit()) 697 if (!printk_ratelimit())
697 return; 698 return;
698 699
699 printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", 700 printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
700 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 701 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
701 tsk->comm, task_pid_nr(tsk), address, 702 tsk->comm, task_pid_nr(tsk), address,
702 (void *)regs->ip, (void *)regs->sp, error_code); 703 (void *)regs->ip, (void *)regs->sp, error_code);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 58f621e81919..2112ed55e7ea 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmap);
103EXPORT_SYMBOL(kunmap); 103EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic); 104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic); 105EXPORT_SYMBOL(kunmap_atomic);
106EXPORT_SYMBOL(kmap_atomic_prot);
106 107
107void __init set_highmem_pages_init(void) 108void __init set_highmem_pages_init(void)
108{ 109{
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 47ce9a2ce5e7..0607119cef94 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -12,6 +12,7 @@
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
14#include <asm/tlb.h> 14#include <asm/tlb.h>
15#include <asm/proto.h>
15 16
16DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 17DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
17 18
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b177652251a4..6176fe8f29e0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -598,8 +598,15 @@ void __init paging_init(void)
598 598
599 sparse_memory_present_with_active_regions(MAX_NUMNODES); 599 sparse_memory_present_with_active_regions(MAX_NUMNODES);
600 sparse_init(); 600 sparse_init();
601 /* clear the default setting with node 0 */ 601
602 nodes_clear(node_states[N_NORMAL_MEMORY]); 602 /*
603 * clear the default setting with node 0
604 * note: don't use nodes_clear here, that is really clearing when
605 * numa support is not compiled in, and later node_set_state
606 * will not set it back.
607 */
608 node_clear_state(0, N_NORMAL_MEMORY);
609
603 free_area_init_nodes(max_zone_pfns); 610 free_area_init_nodes(max_zone_pfns);
604} 611}
605 612
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1b734d7a8966..7e600c1962db 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -591,9 +591,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
591 unsigned int level; 591 unsigned int level;
592 pte_t *kpte, old_pte; 592 pte_t *kpte, old_pte;
593 593
594 if (cpa->flags & CPA_PAGES_ARRAY) 594 if (cpa->flags & CPA_PAGES_ARRAY) {
595 address = (unsigned long)page_address(cpa->pages[cpa->curpage]); 595 struct page *page = cpa->pages[cpa->curpage];
596 else if (cpa->flags & CPA_ARRAY) 596 if (unlikely(PageHighMem(page)))
597 return 0;
598 address = (unsigned long)page_address(page);
599 } else if (cpa->flags & CPA_ARRAY)
597 address = cpa->vaddr[cpa->curpage]; 600 address = cpa->vaddr[cpa->curpage];
598 else 601 else
599 address = *cpa->vaddr; 602 address = *cpa->vaddr;
@@ -697,9 +700,12 @@ static int cpa_process_alias(struct cpa_data *cpa)
697 * No need to redo, when the primary call touched the direct 700 * No need to redo, when the primary call touched the direct
698 * mapping already: 701 * mapping already:
699 */ 702 */
700 if (cpa->flags & CPA_PAGES_ARRAY) 703 if (cpa->flags & CPA_PAGES_ARRAY) {
701 vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); 704 struct page *page = cpa->pages[cpa->curpage];
702 else if (cpa->flags & CPA_ARRAY) 705 if (unlikely(PageHighMem(page)))
706 return 0;
707 vaddr = (unsigned long)page_address(page);
708 } else if (cpa->flags & CPA_ARRAY)
703 vaddr = cpa->vaddr[cpa->curpage]; 709 vaddr = cpa->vaddr[cpa->curpage];
704 else 710 else
705 vaddr = *cpa->vaddr; 711 vaddr = *cpa->vaddr;
@@ -997,12 +1003,15 @@ EXPORT_SYMBOL(set_memory_array_uc);
997int _set_memory_wc(unsigned long addr, int numpages) 1003int _set_memory_wc(unsigned long addr, int numpages)
998{ 1004{
999 int ret; 1005 int ret;
1006 unsigned long addr_copy = addr;
1007
1000 ret = change_page_attr_set(&addr, numpages, 1008 ret = change_page_attr_set(&addr, numpages,
1001 __pgprot(_PAGE_CACHE_UC_MINUS), 0); 1009 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
1002
1003 if (!ret) { 1010 if (!ret) {
1004 ret = change_page_attr_set(&addr, numpages, 1011 ret = change_page_attr_set_clr(&addr_copy, numpages,
1005 __pgprot(_PAGE_CACHE_WC), 0); 1012 __pgprot(_PAGE_CACHE_WC),
1013 __pgprot(_PAGE_CACHE_MASK),
1014 0, 0, NULL);
1006 } 1015 }
1007 return ret; 1016 return ret;
1008} 1017}
@@ -1119,7 +1128,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
1119 int free_idx; 1128 int free_idx;
1120 1129
1121 for (i = 0; i < addrinarray; i++) { 1130 for (i = 0; i < addrinarray; i++) {
1122 start = (unsigned long)page_address(pages[i]); 1131 if (PageHighMem(pages[i]))
1132 continue;
1133 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1123 end = start + PAGE_SIZE; 1134 end = start + PAGE_SIZE;
1124 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) 1135 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
1125 goto err_out; 1136 goto err_out;
@@ -1132,7 +1143,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
1132err_out: 1143err_out:
1133 free_idx = i; 1144 free_idx = i;
1134 for (i = 0; i < free_idx; i++) { 1145 for (i = 0; i < free_idx; i++) {
1135 start = (unsigned long)page_address(pages[i]); 1146 if (PageHighMem(pages[i]))
1147 continue;
1148 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1136 end = start + PAGE_SIZE; 1149 end = start + PAGE_SIZE;
1137 free_memtype(start, end); 1150 free_memtype(start, end);
1138 } 1151 }
@@ -1161,7 +1174,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
1161 return retval; 1174 return retval;
1162 1175
1163 for (i = 0; i < addrinarray; i++) { 1176 for (i = 0; i < addrinarray; i++) {
1164 start = (unsigned long)page_address(pages[i]); 1177 if (PageHighMem(pages[i]))
1178 continue;
1179 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1165 end = start + PAGE_SIZE; 1180 end = start + PAGE_SIZE;
1166 free_memtype(start, end); 1181 free_memtype(start, end);
1167 } 1182 }
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8e43bdd45456..ed34f5e35999 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -25,7 +25,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
25 return pte; 25 return pte;
26} 26}
27 27
28void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 28void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
29{ 29{
30 pgtable_page_dtor(pte); 30 pgtable_page_dtor(pte);
31 paravirt_release_pte(page_to_pfn(pte)); 31 paravirt_release_pte(page_to_pfn(pte));
@@ -33,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
33} 33}
34 34
35#if PAGETABLE_LEVELS > 2 35#if PAGETABLE_LEVELS > 2
36void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) 36void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
37{ 37{
38 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); 38 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
39 tlb_remove_page(tlb, virt_to_page(pmd)); 39 tlb_remove_page(tlb, virt_to_page(pmd));
40} 40}
41 41
42#if PAGETABLE_LEVELS > 3 42#if PAGETABLE_LEVELS > 3
43void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 43void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
44{ 44{
45 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 45 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
46 tlb_remove_page(tlb, virt_to_page(pud)); 46 tlb_remove_page(tlb, virt_to_page(pud));
@@ -329,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve)
329 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 329 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
330 (int)-reserve); 330 (int)-reserve);
331 __FIXADDR_TOP = -reserve - PAGE_SIZE; 331 __FIXADDR_TOP = -reserve - PAGE_SIZE;
332 __VMALLOC_RESERVE += reserve;
333#endif 332#endif
334} 333}
335 334
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 2dfcbf9df2ae..dbb5381f7b3b 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -79,8 +79,10 @@ static __init void bad_srat(void)
79 acpi_numa = -1; 79 acpi_numa = -1;
80 for (i = 0; i < MAX_LOCAL_APIC; i++) 80 for (i = 0; i < MAX_LOCAL_APIC; i++)
81 apicid_to_node[i] = NUMA_NO_NODE; 81 apicid_to_node[i] = NUMA_NO_NODE;
82 for (i = 0; i < MAX_NUMNODES; i++) 82 for (i = 0; i < MAX_NUMNODES; i++) {
83 nodes_add[i].start = nodes[i].end = 0; 83 nodes[i].start = nodes[i].end = 0;
84 nodes_add[i].start = nodes_add[i].end = 0;
85 }
84 remove_all_active_ranges(); 86 remove_all_active_ranges();
85} 87}
86 88
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index b07dd8d0b321..89b9a5cd63da 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -390,7 +390,7 @@ static int __init p4_init(char **cpu_type)
390static int force_arch_perfmon; 390static int force_arch_perfmon;
391static int force_cpu_type(const char *str, struct kernel_param *kp) 391static int force_cpu_type(const char *str, struct kernel_param *kp)
392{ 392{
393 if (!strcmp(str, "archperfmon")) { 393 if (!strcmp(str, "arch_perfmon")) {
394 force_arch_perfmon = 1; 394 force_arch_perfmon = 1;
395 printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); 395 printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
396 } 396 }
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index b26626dc517c..1014eb4bfc37 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -68,6 +68,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
68 unsigned long flags; 68 unsigned long flags;
69 struct resource *root; 69 struct resource *root;
70 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; 70 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES;
71 u64 start, end;
72
73 if (bus_has_transparent_bridge(info->bus))
74 max_root_bus_resources -= 3;
71 75
72 status = resource_to_addr(acpi_res, &addr); 76 status = resource_to_addr(acpi_res, &addr);
73 if (!ACPI_SUCCESS(status)) 77 if (!ACPI_SUCCESS(status))
@@ -84,25 +88,24 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
84 } else 88 } else
85 return AE_OK; 89 return AE_OK;
86 90
87 res = &info->res[info->res_num]; 91 start = addr.minimum + addr.translation_offset;
88 res->name = info->name; 92 end = start + addr.address_length - 1;
89 res->flags = flags;
90 res->start = addr.minimum + addr.translation_offset;
91 res->end = res->start + addr.address_length - 1;
92 res->child = NULL;
93
94 if (bus_has_transparent_bridge(info->bus))
95 max_root_bus_resources -= 3;
96 if (info->res_num >= max_root_bus_resources) { 93 if (info->res_num >= max_root_bus_resources) {
97 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " 94 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx "
98 "from %s for %s due to _CRS returning more than " 95 "from %s for %s due to _CRS returning more than "
99 "%d resource descriptors\n", (unsigned long) res->start, 96 "%d resource descriptors\n", (unsigned long) start,
100 (unsigned long) res->end, root->name, info->name, 97 (unsigned long) end, root->name, info->name,
101 max_root_bus_resources); 98 max_root_bus_resources);
102 info->res_num++;
103 return AE_OK; 99 return AE_OK;
104 } 100 }
105 101
102 res = &info->res[info->res_num];
103 res->name = info->name;
104 res->flags = flags;
105 res->start = start;
106 res->end = end;
107 res->child = NULL;
108
106 if (insert_resource(root, res)) { 109 if (insert_resource(root, res)) {
107 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " 110 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx "
108 "from %s for %s\n", (unsigned long) res->start, 111 "from %s for %s\n", (unsigned long) res->start,
@@ -115,23 +118,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
115} 118}
116 119
117static void 120static void
118adjust_transparent_bridge_resources(struct pci_bus *bus)
119{
120 struct pci_dev *dev;
121
122 list_for_each_entry(dev, &bus->devices, bus_list) {
123 int i;
124 u16 class = dev->class >> 8;
125
126 if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) {
127 for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++)
128 dev->subordinate->resource[i] =
129 dev->bus->resource[i - 3];
130 }
131 }
132}
133
134static void
135get_current_resources(struct acpi_device *device, int busnum, 121get_current_resources(struct acpi_device *device, int busnum,
136 int domain, struct pci_bus *bus) 122 int domain, struct pci_bus *bus)
137{ 123{
@@ -158,8 +144,6 @@ get_current_resources(struct acpi_device *device, int busnum,
158 info.res_num = 0; 144 info.res_num = 0;
159 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, 145 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
160 &info); 146 &info);
161 if (info.res_num)
162 adjust_transparent_bridge_resources(bus);
163 147
164 return; 148 return;
165 149
@@ -222,8 +206,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
222 */ 206 */
223 memcpy(bus->sysdata, sd, sizeof(*sd)); 207 memcpy(bus->sysdata, sd, sizeof(*sd));
224 kfree(sd); 208 kfree(sd);
225 } else 209 } else {
226 bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); 210 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
211 if (bus) {
212 if (pci_probe & PCI_USE__CRS)
213 get_current_resources(device, busnum, domain,
214 bus);
215 bus->subordinate = pci_scan_child_bus(bus);
216 }
217 }
227 218
228 if (!bus) 219 if (!bus)
229 kfree(sd); 220 kfree(sd);
@@ -238,8 +229,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
238#endif 229#endif
239 } 230 }
240 231
241 if (bus && (pci_probe & PCI_USE__CRS))
242 get_current_resources(device, busnum, domain, bus);
243 return bus; 232 return bus;
244} 233}
245 234
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index f893d6a6e803..3ffa10df20b9 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -100,8 +100,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
100 int j; 100 int j;
101 struct pci_root_info *info; 101 struct pci_root_info *info;
102 102
103 /* don't go for it if _CRS is used */ 103 /* don't go for it if _CRS is used already */
104 if (pci_probe & PCI_USE__CRS) 104 if (b->resource[0] != &ioport_resource ||
105 b->resource[1] != &iomem_resource)
105 return; 106 return;
106 107
107 /* if only one root bus, don't need to anything */ 108 /* if only one root bus, don't need to anything */
@@ -116,6 +117,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
116 if (i == pci_root_num) 117 if (i == pci_root_num)
117 return; 118 return;
118 119
120 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
121 b->number);
122
119 info = &pci_root_info[i]; 123 info = &pci_root_info[i];
120 for (j = 0; j < info->res_num; j++) { 124 for (j = 0; j < info->res_num; j++) {
121 struct resource *res; 125 struct resource *res;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 0fb56db16d18..52e62e57fedd 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -35,6 +35,7 @@
35#include <asm/pat.h> 35#include <asm/pat.h>
36#include <asm/e820.h> 36#include <asm/e820.h>
37#include <asm/pci_x86.h> 37#include <asm/pci_x86.h>
38#include <asm/io_apic.h>
38 39
39 40
40static int 41static int
@@ -227,6 +228,12 @@ void __init pcibios_resource_survey(void)
227 pcibios_allocate_resources(1); 228 pcibios_allocate_resources(1);
228 229
229 e820_reserve_resources_late(); 230 e820_reserve_resources_late();
231 /*
232 * Insert the IO APIC resources after PCI initialization has
233 * occured to handle IO APICS that are mapped in on a BAR in
234 * PCI space, but before trying to assign unassigned pci res.
235 */
236 ioapic_insert_resources();
230} 237}
231 238
232/** 239/**
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index de2abbd07544..a6a198c33623 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,7 +1,7 @@
1# __restore_processor_state() restores %gs after S3 resume and so should not 1# __restore_processor_state() restores %gs after S3 resume and so should not
2# itself be stack-protected 2# itself be stack-protected
3nostackp := $(call cc-option, -fno-stack-protector) 3nostackp := $(call cc-option, -fno-stack-protector)
4CFLAGS_cpu_$(BITS).o := $(nostackp) 4CFLAGS_cpu.o := $(nostackp)
5 5
6obj-$(CONFIG_PM_SLEEP) += cpu.o 6obj-$(CONFIG_PM_SLEEP) += cpu.o
7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o 7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o