aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2009-09-08 20:55:54 -0400
committerDan Williams <dan.j.williams@intel.com>2009-09-08 20:55:54 -0400
commit9134d02bc0af4a8747d448d1f811ec5f8eb96df6 (patch)
tree704c3e5dcc10f360815c4868a74711f82fb62e27 /arch/x86
parentbbb20089a3275a19e475dbc21320c3742e3ca423 (diff)
parent80ffb3cceaefa405f2ecd46d66500ed8d53efe74 (diff)
Merge commit 'md/for-linus' into async-tx-next
Conflicts: drivers/md/raid5.c
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig13
-rw-r--r--arch/x86/boot/video-bios.c3
-rw-r--r--arch/x86/boot/video-vesa.c4
-rw-r--r--arch/x86/include/asm/atomic_32.h185
-rw-r--r--arch/x86/include/asm/atomic_64.h42
-rw-r--r--arch/x86/include/asm/boot.h6
-rw-r--r--arch/x86/include/asm/efi.h5
-rw-r--r--arch/x86/include/asm/fixmap.h10
-rw-r--r--arch/x86/include/asm/io_apic.h2
-rw-r--r--arch/x86/include/asm/irqflags.h8
-rw-r--r--arch/x86/include/asm/lguest.h3
-rw-r--r--arch/x86/include/asm/lguest_hcall.h18
-rw-r--r--arch/x86/include/asm/msr-index.h4
-rw-r--r--arch/x86/include/asm/nmi.h1
-rw-r--r--arch/x86/include/asm/pci.h2
-rw-r--r--arch/x86/include/asm/percpu.h10
-rw-r--r--arch/x86/include/asm/perf_counter.h3
-rw-r--r--arch/x86/include/asm/pgalloc.h25
-rw-r--r--arch/x86/include/asm/proto.h11
-rw-r--r--arch/x86/include/asm/spinlock.h4
-rw-r--r--arch/x86/include/asm/stacktrace.h2
-rw-r--r--arch/x86/include/asm/thread_info.h2
-rw-r--r--arch/x86/include/asm/uaccess.h4
-rw-r--r--arch/x86/include/asm/uaccess_64.h10
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h9
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/amd_iommu.c4
-rw-r--r--arch/x86/kernel/amd_iommu_init.c13
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/es7000_32.c3
-rw-r--r--arch/x86/kernel/apic/io_apic.c47
-rw-r--r--arch/x86/kernel/apic/numaq_32.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c38
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c6
-rw-r--r--arch/x86/kernel/cpu/common.c3
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c32
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c22
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c283
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c5
-rw-r--r--arch/x86/kernel/dumpstack.c1
-rw-r--r--arch/x86/kernel/dumpstack_32.c6
-rw-r--r--arch/x86/kernel/dumpstack_64.c22
-rw-r--r--arch/x86/kernel/e820.c23
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/efi_64.c6
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/mfgpt_32.c2
-rw-r--r--arch/x86/kernel/pci-dma.c2
-rw-r--r--arch/x86/kernel/pci-gart_64.c2
-rw-r--r--arch/x86/kernel/pvclock.c2
-rw-r--r--arch/x86/kernel/reboot.c42
-rw-r--r--arch/x86/kernel/setup.c29
-rw-r--r--arch/x86/kernel/setup_percpu.c219
-rw-r--r--arch/x86/kernel/tlb_uv.c9
-rw-r--r--arch/x86/kernel/traps.c3
-rw-r--r--arch/x86/kernel/vmlinux.lds.S23
-rw-r--r--arch/x86/kvm/mmu.c6
-rw-r--r--arch/x86/kvm/paging_tmpl.h2
-rw-r--r--arch/x86/kvm/vmx.c15
-rw-r--r--arch/x86/kvm/x86.c1
-rw-r--r--arch/x86/kvm/x86_emulate.c2
-rw-r--r--arch/x86/lguest/boot.c510
-rw-r--r--arch/x86/lguest/i386_head.S112
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_32.c230
-rw-r--r--arch/x86/lib/clear_page_64.S5
-rw-r--r--arch/x86/lib/copy_user_64.S1
-rw-r--r--arch/x86/lib/delay.c3
-rw-r--r--arch/x86/lib/msr.c26
-rw-r--r--arch/x86/lib/usercopy_32.c2
-rw-r--r--arch/x86/mm/fault.c11
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--arch/x86/mm/init.c18
-rw-r--r--arch/x86/mm/init_64.c9
-rw-r--r--arch/x86/mm/pageattr.c104
-rw-r--r--arch/x86/mm/pgtable.c7
-rw-r--r--arch/x86/mm/srat_64.c6
-rw-r--r--arch/x86/oprofile/nmi_int.c2
-rw-r--r--arch/x86/pci/acpi.c59
-rw-r--r--arch/x86/pci/amd_bus.c8
-rw-r--r--arch/x86/pci/i386.c7
-rw-r--r--arch/x86/power/Makefile2
-rw-r--r--arch/x86/power/cpu.c2
89 files changed, 1638 insertions, 775 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d1430ef6b4f9..738bdc6b0f8b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1913,25 +1913,26 @@ config DMAR_DEFAULT_ON
1913 recommended you say N here while the DMAR code remains 1913 recommended you say N here while the DMAR code remains
1914 experimental. 1914 experimental.
1915 1915
1916config DMAR_GFX_WA 1916config DMAR_BROKEN_GFX_WA
1917 def_bool y 1917 def_bool n
1918 prompt "Support for Graphics workaround" 1918 prompt "Workaround broken graphics drivers (going away soon)"
1919 depends on DMAR 1919 depends on DMAR
1920 ---help--- 1920 ---help---
1921 Current Graphics drivers tend to use physical address 1921 Current Graphics drivers tend to use physical address
1922 for DMA and avoid using DMA APIs. Setting this config 1922 for DMA and avoid using DMA APIs. Setting this config
1923 option permits the IOMMU driver to set a unity map for 1923 option permits the IOMMU driver to set a unity map for
1924 all the OS-visible memory. Hence the driver can continue 1924 all the OS-visible memory. Hence the driver can continue
1925 to use physical addresses for DMA. 1925 to use physical addresses for DMA, at least until this
1926 option is removed in the 2.6.32 kernel.
1926 1927
1927config DMAR_FLOPPY_WA 1928config DMAR_FLOPPY_WA
1928 def_bool y 1929 def_bool y
1929 depends on DMAR 1930 depends on DMAR
1930 ---help--- 1931 ---help---
1931 Floppy disk drivers are know to bypass DMA API calls 1932 Floppy disk drivers are known to bypass DMA API calls
1932 thereby failing to work when IOMMU is enabled. This 1933 thereby failing to work when IOMMU is enabled. This
1933 workaround will setup a 1:1 mapping for the first 1934 workaround will setup a 1:1 mapping for the first
1934 16M to make floppy (an ISA device) work. 1935 16MiB to make floppy (an ISA device) work.
1935 1936
1936config INTR_REMAP 1937config INTR_REMAP
1937 bool "Support for Interrupt Remapping (EXPERIMENTAL)" 1938 bool "Support for Interrupt Remapping (EXPERIMENTAL)"
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index d660be492363..49e0c18833e0 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -37,14 +37,13 @@ static int set_bios_mode(u8 mode)
37 ireg.al = mode; /* AH=0x00 Set Video Mode */ 37 ireg.al = mode; /* AH=0x00 Set Video Mode */
38 intcall(0x10, &ireg, NULL); 38 intcall(0x10, &ireg, NULL);
39 39
40
41 ireg.ah = 0x0f; /* Get Current Video Mode */ 40 ireg.ah = 0x0f; /* Get Current Video Mode */
42 intcall(0x10, &ireg, &oreg); 41 intcall(0x10, &ireg, &oreg);
43 42
44 do_restore = 1; /* Assume video contents were lost */ 43 do_restore = 1; /* Assume video contents were lost */
45 44
46 /* Not all BIOSes are clean with the top bit */ 45 /* Not all BIOSes are clean with the top bit */
47 new_mode = ireg.al & 0x7f; 46 new_mode = oreg.al & 0x7f;
48 47
49 if (new_mode == mode) 48 if (new_mode == mode)
50 return 0; /* Mode change OK */ 49 return 0; /* Mode change OK */
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index c700147d6ffb..275dd177f198 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -45,7 +45,7 @@ static int vesa_probe(void)
45 ireg.di = (size_t)&vginfo; 45 ireg.di = (size_t)&vginfo;
46 intcall(0x10, &ireg, &oreg); 46 intcall(0x10, &ireg, &oreg);
47 47
48 if (ireg.ax != 0x004f || 48 if (oreg.ax != 0x004f ||
49 vginfo.signature != VESA_MAGIC || 49 vginfo.signature != VESA_MAGIC ||
50 vginfo.version < 0x0102) 50 vginfo.version < 0x0102)
51 return 0; /* Not present */ 51 return 0; /* Not present */
@@ -70,7 +70,7 @@ static int vesa_probe(void)
70 ireg.di = (size_t)&vminfo; 70 ireg.di = (size_t)&vminfo;
71 intcall(0x10, &ireg, &oreg); 71 intcall(0x10, &ireg, &oreg);
72 72
73 if (ireg.ax != 0x004f) 73 if (oreg.ax != 0x004f)
74 continue; 74 continue;
75 75
76 if ((vminfo.mode_attr & 0x15) == 0x05) { 76 if ((vminfo.mode_attr & 0x15) == 0x05) {
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 2503d4e64c2a..dc5a667ff791 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -19,7 +19,10 @@
19 * 19 *
20 * Atomically reads the value of @v. 20 * Atomically reads the value of @v.
21 */ 21 */
22#define atomic_read(v) ((v)->counter) 22static inline int atomic_read(const atomic_t *v)
23{
24 return v->counter;
25}
23 26
24/** 27/**
25 * atomic_set - set atomic variable 28 * atomic_set - set atomic variable
@@ -28,7 +31,10 @@
28 * 31 *
29 * Atomically sets the value of @v to @i. 32 * Atomically sets the value of @v to @i.
30 */ 33 */
31#define atomic_set(v, i) (((v)->counter) = (i)) 34static inline void atomic_set(atomic_t *v, int i)
35{
36 v->counter = i;
37}
32 38
33/** 39/**
34 * atomic_add - add integer to atomic variable 40 * atomic_add - add integer to atomic variable
@@ -200,8 +206,15 @@ static inline int atomic_sub_return(int i, atomic_t *v)
200 return atomic_add_return(-i, v); 206 return atomic_add_return(-i, v);
201} 207}
202 208
203#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 209static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
204#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) 210{
211 return cmpxchg(&v->counter, old, new);
212}
213
214static inline int atomic_xchg(atomic_t *v, int new)
215{
216 return xchg(&v->counter, new);
217}
205 218
206/** 219/**
207 * atomic_add_unless - add unless the number is already a given value 220 * atomic_add_unless - add unless the number is already a given value
@@ -250,45 +263,12 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
250/* An 64bit atomic type */ 263/* An 64bit atomic type */
251 264
252typedef struct { 265typedef struct {
253 unsigned long long counter; 266 u64 __aligned(8) counter;
254} atomic64_t; 267} atomic64_t;
255 268
256#define ATOMIC64_INIT(val) { (val) } 269#define ATOMIC64_INIT(val) { (val) }
257 270
258/** 271extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
259 * atomic64_read - read atomic64 variable
260 * @ptr: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292 272
293/** 273/**
294 * atomic64_xchg - xchg atomic64 variable 274 * atomic64_xchg - xchg atomic64 variable
@@ -298,18 +278,7 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
298 * Atomically xchgs the value of @ptr to @new_val and returns 278 * Atomically xchgs the value of @ptr to @new_val and returns
299 * the old value. 279 * the old value.
300 */ 280 */
301 281extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
302static inline unsigned long long
303atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
304{
305 unsigned long long old_val;
306
307 do {
308 old_val = atomic_read(ptr);
309 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
310
311 return old_val;
312}
313 282
314/** 283/**
315 * atomic64_set - set atomic64 variable 284 * atomic64_set - set atomic64 variable
@@ -318,10 +287,7 @@ atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
318 * 287 *
319 * Atomically sets the value of @ptr to @new_val. 288 * Atomically sets the value of @ptr to @new_val.
320 */ 289 */
321static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) 290extern void atomic64_set(atomic64_t *ptr, u64 new_val);
322{
323 atomic64_xchg(ptr, new_val);
324}
325 291
326/** 292/**
327 * atomic64_read - read atomic64 variable 293 * atomic64_read - read atomic64 variable
@@ -329,17 +295,30 @@ static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
329 * 295 *
330 * Atomically reads the value of @ptr and returns it. 296 * Atomically reads the value of @ptr and returns it.
331 */ 297 */
332static inline unsigned long long atomic64_read(atomic64_t *ptr) 298static inline u64 atomic64_read(atomic64_t *ptr)
333{ 299{
334 unsigned long long curr_val; 300 u64 res;
335 301
336 do { 302 /*
337 curr_val = __atomic64_read(ptr); 303 * Note, we inline this atomic64_t primitive because
338 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); 304 * it only clobbers EAX/EDX and leaves the others
339 305 * untouched. We also (somewhat subtly) rely on the
340 return curr_val; 306 * fact that cmpxchg8b returns the current 64-bit value
307 * of the memory location we are touching:
308 */
309 asm volatile(
310 "mov %%ebx, %%eax\n\t"
311 "mov %%ecx, %%edx\n\t"
312 LOCK_PREFIX "cmpxchg8b %1\n"
313 : "=&A" (res)
314 : "m" (*ptr)
315 );
316
317 return res;
341} 318}
342 319
320extern u64 atomic64_read(atomic64_t *ptr);
321
343/** 322/**
344 * atomic64_add_return - add and return 323 * atomic64_add_return - add and return
345 * @delta: integer value to add 324 * @delta: integer value to add
@@ -347,34 +326,14 @@ static inline unsigned long long atomic64_read(atomic64_t *ptr)
347 * 326 *
348 * Atomically adds @delta to @ptr and returns @delta + *@ptr 327 * Atomically adds @delta to @ptr and returns @delta + *@ptr
349 */ 328 */
350static inline unsigned long long 329extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
351atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
352{
353 unsigned long long old_val, new_val;
354
355 do {
356 old_val = atomic_read(ptr);
357 new_val = old_val + delta;
358
359 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
360
361 return new_val;
362}
363
364static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
365{
366 return atomic64_add_return(-delta, ptr);
367}
368 330
369static inline long atomic64_inc_return(atomic64_t *ptr) 331/*
370{ 332 * Other variants with different arithmetic operators:
371 return atomic64_add_return(1, ptr); 333 */
372} 334extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
373 335extern u64 atomic64_inc_return(atomic64_t *ptr);
374static inline long atomic64_dec_return(atomic64_t *ptr) 336extern u64 atomic64_dec_return(atomic64_t *ptr);
375{
376 return atomic64_sub_return(1, ptr);
377}
378 337
379/** 338/**
380 * atomic64_add - add integer to atomic64 variable 339 * atomic64_add - add integer to atomic64 variable
@@ -383,10 +342,7 @@ static inline long atomic64_dec_return(atomic64_t *ptr)
383 * 342 *
384 * Atomically adds @delta to @ptr. 343 * Atomically adds @delta to @ptr.
385 */ 344 */
386static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) 345extern void atomic64_add(u64 delta, atomic64_t *ptr);
387{
388 atomic64_add_return(delta, ptr);
389}
390 346
391/** 347/**
392 * atomic64_sub - subtract the atomic64 variable 348 * atomic64_sub - subtract the atomic64 variable
@@ -395,10 +351,7 @@ static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
395 * 351 *
396 * Atomically subtracts @delta from @ptr. 352 * Atomically subtracts @delta from @ptr.
397 */ 353 */
398static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) 354extern void atomic64_sub(u64 delta, atomic64_t *ptr);
399{
400 atomic64_add(-delta, ptr);
401}
402 355
403/** 356/**
404 * atomic64_sub_and_test - subtract value from variable and test result 357 * atomic64_sub_and_test - subtract value from variable and test result
@@ -409,13 +362,7 @@ static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
409 * true if the result is zero, or false for all 362 * true if the result is zero, or false for all
410 * other cases. 363 * other cases.
411 */ 364 */
412static inline int 365extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
413atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
414{
415 unsigned long long old_val = atomic64_sub_return(delta, ptr);
416
417 return old_val == 0;
418}
419 366
420/** 367/**
421 * atomic64_inc - increment atomic64 variable 368 * atomic64_inc - increment atomic64 variable
@@ -423,10 +370,7 @@ atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
423 * 370 *
424 * Atomically increments @ptr by 1. 371 * Atomically increments @ptr by 1.
425 */ 372 */
426static inline void atomic64_inc(atomic64_t *ptr) 373extern void atomic64_inc(atomic64_t *ptr);
427{
428 atomic64_add(1, ptr);
429}
430 374
431/** 375/**
432 * atomic64_dec - decrement atomic64 variable 376 * atomic64_dec - decrement atomic64 variable
@@ -434,10 +378,7 @@ static inline void atomic64_inc(atomic64_t *ptr)
434 * 378 *
435 * Atomically decrements @ptr by 1. 379 * Atomically decrements @ptr by 1.
436 */ 380 */
437static inline void atomic64_dec(atomic64_t *ptr) 381extern void atomic64_dec(atomic64_t *ptr);
438{
439 atomic64_sub(1, ptr);
440}
441 382
442/** 383/**
443 * atomic64_dec_and_test - decrement and test 384 * atomic64_dec_and_test - decrement and test
@@ -447,10 +388,7 @@ static inline void atomic64_dec(atomic64_t *ptr)
447 * returns true if the result is 0, or false for all other 388 * returns true if the result is 0, or false for all other
448 * cases. 389 * cases.
449 */ 390 */
450static inline int atomic64_dec_and_test(atomic64_t *ptr) 391extern int atomic64_dec_and_test(atomic64_t *ptr);
451{
452 return atomic64_sub_and_test(1, ptr);
453}
454 392
455/** 393/**
456 * atomic64_inc_and_test - increment and test 394 * atomic64_inc_and_test - increment and test
@@ -460,10 +398,7 @@ static inline int atomic64_dec_and_test(atomic64_t *ptr)
460 * and returns true if the result is zero, or false for all 398 * and returns true if the result is zero, or false for all
461 * other cases. 399 * other cases.
462 */ 400 */
463static inline int atomic64_inc_and_test(atomic64_t *ptr) 401extern int atomic64_inc_and_test(atomic64_t *ptr);
464{
465 return atomic64_sub_and_test(-1, ptr);
466}
467 402
468/** 403/**
469 * atomic64_add_negative - add and test if negative 404 * atomic64_add_negative - add and test if negative
@@ -474,13 +409,7 @@ static inline int atomic64_inc_and_test(atomic64_t *ptr)
474 * if the result is negative, or false when 409 * if the result is negative, or false when
475 * result is greater than or equal to zero. 410 * result is greater than or equal to zero.
476 */ 411 */
477static inline int 412extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
478atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
479{
480 long long old_val = atomic64_add_return(delta, ptr);
481
482 return old_val < 0;
483}
484 413
485#include <asm-generic/atomic-long.h> 414#include <asm-generic/atomic-long.h>
486#endif /* _ASM_X86_ATOMIC_32_H */ 415#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
index 0d6360220007..d605dc268e79 100644
--- a/arch/x86/include/asm/atomic_64.h
+++ b/arch/x86/include/asm/atomic_64.h
@@ -18,7 +18,10 @@
18 * 18 *
19 * Atomically reads the value of @v. 19 * Atomically reads the value of @v.
20 */ 20 */
21#define atomic_read(v) ((v)->counter) 21static inline int atomic_read(const atomic_t *v)
22{
23 return v->counter;
24}
22 25
23/** 26/**
24 * atomic_set - set atomic variable 27 * atomic_set - set atomic variable
@@ -27,7 +30,10 @@
27 * 30 *
28 * Atomically sets the value of @v to @i. 31 * Atomically sets the value of @v to @i.
29 */ 32 */
30#define atomic_set(v, i) (((v)->counter) = (i)) 33static inline void atomic_set(atomic_t *v, int i)
34{
35 v->counter = i;
36}
31 37
32/** 38/**
33 * atomic_add - add integer to atomic variable 39 * atomic_add - add integer to atomic variable
@@ -192,7 +198,10 @@ static inline int atomic_sub_return(int i, atomic_t *v)
192 * Atomically reads the value of @v. 198 * Atomically reads the value of @v.
193 * Doesn't imply a read memory barrier. 199 * Doesn't imply a read memory barrier.
194 */ 200 */
195#define atomic64_read(v) ((v)->counter) 201static inline long atomic64_read(const atomic64_t *v)
202{
203 return v->counter;
204}
196 205
197/** 206/**
198 * atomic64_set - set atomic64 variable 207 * atomic64_set - set atomic64 variable
@@ -201,7 +210,10 @@ static inline int atomic_sub_return(int i, atomic_t *v)
201 * 210 *
202 * Atomically sets the value of @v to @i. 211 * Atomically sets the value of @v to @i.
203 */ 212 */
204#define atomic64_set(v, i) (((v)->counter) = (i)) 213static inline void atomic64_set(atomic64_t *v, long i)
214{
215 v->counter = i;
216}
205 217
206/** 218/**
207 * atomic64_add - add integer to atomic64 variable 219 * atomic64_add - add integer to atomic64 variable
@@ -355,11 +367,25 @@ static inline long atomic64_sub_return(long i, atomic64_t *v)
355#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) 367#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
356#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) 368#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
357 369
358#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 370static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
359#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) 371{
372 return cmpxchg(&v->counter, old, new);
373}
374
375static inline long atomic64_xchg(atomic64_t *v, long new)
376{
377 return xchg(&v->counter, new);
378}
360 379
361#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) 380static inline long atomic_cmpxchg(atomic_t *v, int old, int new)
362#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) 381{
382 return cmpxchg(&v->counter, old, new);
383}
384
385static inline long atomic_xchg(atomic_t *v, int new)
386{
387 return xchg(&v->counter, new);
388}
363 389
364/** 390/**
365 * atomic_add_unless - add unless the number is a given value 391 * atomic_add_unless - add unless the number is a given value
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 418e632d4a80..7a1065958ba9 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -8,7 +8,7 @@
8 8
9#ifdef __KERNEL__ 9#ifdef __KERNEL__
10 10
11#include <asm/page_types.h> 11#include <asm/pgtable_types.h>
12 12
13/* Physical address where kernel should be loaded. */ 13/* Physical address where kernel should be loaded. */
14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ 14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
@@ -16,10 +16,10 @@
16 & ~(CONFIG_PHYSICAL_ALIGN - 1)) 16 & ~(CONFIG_PHYSICAL_ALIGN - 1))
17 17
18/* Minimum kernel alignment, as a power of two */ 18/* Minimum kernel alignment, as a power of two */
19#ifdef CONFIG_x86_64 19#ifdef CONFIG_X86_64
20#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT 20#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
21#else 21#else
22#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1) 22#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER)
23#endif 23#endif
24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) 24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
25 25
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index edc90f23e708..8406ed7f9926 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
33#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ 33#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
34 efi_call_virt(f, a1, a2, a3, a4, a5, a6) 34 efi_call_virt(f, a1, a2, a3, a4, a5, a6)
35 35
36#define efi_ioremap(addr, size) ioremap_cache(addr, size) 36#define efi_ioremap(addr, size, type) ioremap_cache(addr, size)
37 37
38#else /* !CONFIG_X86_32 */ 38#else /* !CONFIG_X86_32 */
39 39
@@ -84,7 +84,8 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
84 efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ 84 efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
85 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) 85 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
86 86
87extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); 87extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
88 u32 type);
88 89
89#endif /* CONFIG_X86_32 */ 90#endif /* CONFIG_X86_32 */
90 91
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 2d81af3974a0..7b2d71df39a6 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -111,12 +111,9 @@ enum fixed_addresses {
111#ifdef CONFIG_PARAVIRT 111#ifdef CONFIG_PARAVIRT
112 FIX_PARAVIRT_BOOTMAP, 112 FIX_PARAVIRT_BOOTMAP,
113#endif 113#endif
114 FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ 114 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
115 FIX_TEXT_POKE1, 115 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
116 __end_of_permanent_fixed_addresses, 116 __end_of_permanent_fixed_addresses,
117#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
118 FIX_OHCI1394_BASE,
119#endif
120 /* 117 /*
121 * 256 temporary boot-time mappings, used by early_ioremap(), 118 * 256 temporary boot-time mappings, used by early_ioremap(),
122 * before ioremap() is functional. 119 * before ioremap() is functional.
@@ -129,6 +126,9 @@ enum fixed_addresses {
129 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - 126 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
130 (__end_of_permanent_fixed_addresses & 255), 127 (__end_of_permanent_fixed_addresses & 255),
131 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, 128 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
129#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
130 FIX_OHCI1394_BASE,
131#endif
132#ifdef CONFIG_X86_32 132#ifdef CONFIG_X86_32
133 FIX_WP_TEST, 133 FIX_WP_TEST,
134#endif 134#endif
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index daf866ed0612..330ee807f89e 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -161,6 +161,7 @@ extern int io_apic_set_pci_routing(struct device *dev, int irq,
161 struct io_apic_irq_attr *irq_attr); 161 struct io_apic_irq_attr *irq_attr);
162extern int (*ioapic_renumber_irq)(int ioapic, int irq); 162extern int (*ioapic_renumber_irq)(int ioapic, int irq);
163extern void ioapic_init_mappings(void); 163extern void ioapic_init_mappings(void);
164extern void ioapic_insert_resources(void);
164 165
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 166extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 167extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
@@ -180,6 +181,7 @@ extern void ioapic_write_entry(int apic, int pin,
180#define io_apic_assign_pci_irqs 0 181#define io_apic_assign_pci_irqs 0
181static const int timer_through_8259 = 0; 182static const int timer_through_8259 = 0;
182static inline void ioapic_init_mappings(void) { } 183static inline void ioapic_init_mappings(void) { }
184static inline void ioapic_insert_resources(void) { }
183 185
184static inline void probe_nr_irqs_gsi(void) { } 186static inline void probe_nr_irqs_gsi(void) { }
185#endif 187#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 2bdab21f0898..c6ccbe7e81ad 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -12,9 +12,15 @@ static inline unsigned long native_save_fl(void)
12{ 12{
13 unsigned long flags; 13 unsigned long flags;
14 14
15 /*
16 * Note: this needs to be "=r" not "=rm", because we have the
17 * stack offset from what gcc expects at the time the "pop" is
18 * executed, and so a memory reference with respect to the stack
19 * would end up using the wrong address.
20 */
15 asm volatile("# __raw_save_flags\n\t" 21 asm volatile("# __raw_save_flags\n\t"
16 "pushf ; pop %0" 22 "pushf ; pop %0"
17 : "=g" (flags) 23 : "=r" (flags)
18 : /* no input */ 24 : /* no input */
19 : "memory"); 25 : "memory");
20 26
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 313389cd50d2..5136dad57cbb 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,7 @@
17/* Pages for switcher itself, then two pages per cpu */ 17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
19 19
20/* We map at -4M (-2M when PAE is activated) for ease of mapping 20/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
21 * into the guest (one PTE page). */
22#ifdef CONFIG_X86_PAE 21#ifdef CONFIG_X86_PAE
23#define SWITCHER_ADDR 0xFFE00000 22#define SWITCHER_ADDR 0xFFE00000
24#else 23#else
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index d31c4a684078..ba0eed8aa1a6 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -30,27 +30,27 @@
30#include <asm/hw_irq.h> 30#include <asm/hw_irq.h>
31#include <asm/kvm_para.h> 31#include <asm/kvm_para.h>
32 32
33/*G:031 But first, how does our Guest contact the Host to ask for privileged 33/*G:030
34 * But first, how does our Guest contact the Host to ask for privileged
34 * operations? There are two ways: the direct way is to make a "hypercall", 35 * operations? There are two ways: the direct way is to make a "hypercall",
35 * to make requests of the Host Itself. 36 * to make requests of the Host Itself.
36 * 37 *
37 * We use the KVM hypercall mechanism. Seventeen hypercalls are 38 * We use the KVM hypercall mechanism, though completely different hypercall
38 * available: the hypercall number is put in the %eax register, and the 39 * numbers. Seventeen hypercalls are available: the hypercall number is put in
39 * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. 40 * the %eax register, and the arguments (when required) are placed in %ebx,
40 * If a return value makes sense, it's returned in %eax. 41 * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax.
41 * 42 *
42 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
43 * Host, rather than returning failure. This reflects Winston Churchill's 44 * Host, rather than returning failure. This reflects Winston Churchill's
44 * definition of a gentleman: "someone who is only rude intentionally". */ 45 * definition of a gentleman: "someone who is only rude intentionally".
45/*:*/ 46:*/
46 47
47/* Can't use our min() macro here: needs to be a constant */ 48/* Can't use our min() macro here: needs to be a constant */
48#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 49#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
49 50
50#define LHCALL_RING_SIZE 64 51#define LHCALL_RING_SIZE 64
51struct hcall_args { 52struct hcall_args {
52 /* These map directly onto eax, ebx, ecx, edx and esi 53 /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
53 * in struct lguest_regs */
54 unsigned long arg0, arg1, arg2, arg3, arg4; 54 unsigned long arg0, arg1, arg2, arg3, arg4;
55}; 55};
56 56
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1692fb5050e3..6be7fc254b59 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -246,10 +246,6 @@
246#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) 246#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38)
247#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) 247#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39)
248 248
249/* Intel Model 6 */
250#define MSR_P6_EVNTSEL0 0x00000186
251#define MSR_P6_EVNTSEL1 0x00000187
252
253/* P4/Xeon+ specific */ 249/* P4/Xeon+ specific */
254#define MSR_IA32_MCG_EAX 0x00000180 250#define MSR_IA32_MCG_EAX 0x00000180
255#define MSR_IA32_MCG_EBX 0x00000181 251#define MSR_IA32_MCG_EBX 0x00000181
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c97264409934..c86e5ed4af51 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -72,7 +72,6 @@ void lapic_watchdog_stop(void);
72int lapic_watchdog_init(unsigned nmi_hz); 72int lapic_watchdog_init(unsigned nmi_hz);
73int lapic_wd_event(unsigned nmi_hz); 73int lapic_wd_event(unsigned nmi_hz);
74unsigned lapic_adjust_nmi_hz(unsigned hz); 74unsigned lapic_adjust_nmi_hz(unsigned hz);
75int lapic_watchdog_ok(void);
76void disable_lapic_nmi_watchdog(void); 75void disable_lapic_nmi_watchdog(void);
77void enable_lapic_nmi_watchdog(void); 76void enable_lapic_nmi_watchdog(void);
78void stop_nmi(void); 77void stop_nmi(void);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 927958d13c19..1ff685ca221c 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -91,7 +91,7 @@ extern void pci_iommu_alloc(void);
91 91
92#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) 92#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
93 93
94#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG) 94#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG)
95 95
96#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ 96#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
97 dma_addr_t ADDR_NAME; 97 dma_addr_t ADDR_NAME;
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 02ecb30982a3..103f1ddb0d85 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -42,6 +42,7 @@
42 42
43#else /* ...!ASSEMBLY */ 43#else /* ...!ASSEMBLY */
44 44
45#include <linux/kernel.h>
45#include <linux/stringify.h> 46#include <linux/stringify.h>
46 47
47#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
@@ -155,6 +156,15 @@ do { \
155/* We can use this directly for local CPU (faster). */ 156/* We can use this directly for local CPU (faster). */
156DECLARE_PER_CPU(unsigned long, this_cpu_off); 157DECLARE_PER_CPU(unsigned long, this_cpu_off);
157 158
159#ifdef CONFIG_NEED_MULTIPLE_NODES
160void *pcpu_lpage_remapped(void *kaddr);
161#else
162static inline void *pcpu_lpage_remapped(void *kaddr)
163{
164 return NULL;
165}
166#endif
167
158#endif /* !__ASSEMBLY__ */ 168#endif /* !__ASSEMBLY__ */
159 169
160#ifdef CONFIG_SMP 170#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 5fb33e160ea0..fa64e401589d 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -87,6 +87,9 @@ union cpuid10_edx {
87#ifdef CONFIG_PERF_COUNTERS 87#ifdef CONFIG_PERF_COUNTERS
88extern void init_hw_perf_counters(void); 88extern void init_hw_perf_counters(void);
89extern void perf_counters_lapic_init(void); 89extern void perf_counters_lapic_init(void);
90
91#define PERF_COUNTER_INDEX_OFFSET 0
92
90#else 93#else
91static inline void init_hw_perf_counters(void) { } 94static inline void init_hw_perf_counters(void) { }
92static inline void perf_counters_lapic_init(void) { } 95static inline void perf_counters_lapic_init(void) { }
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index dd14c54ac718..0e8c2a0fd922 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte)
46 __free_page(pte); 46 __free_page(pte);
47} 47}
48 48
49extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); 49extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
50
51static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
52 unsigned long address)
53{
54 ___pte_free_tlb(tlb, pte);
55}
50 56
51static inline void pmd_populate_kernel(struct mm_struct *mm, 57static inline void pmd_populate_kernel(struct mm_struct *mm,
52 pmd_t *pmd, pte_t *pte) 58 pmd_t *pmd, pte_t *pte)
@@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
78 free_page((unsigned long)pmd); 84 free_page((unsigned long)pmd);
79} 85}
80 86
81extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); 87extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
88
89static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
90 unsigned long adddress)
91{
92 ___pmd_free_tlb(tlb, pmd);
93}
82 94
83#ifdef CONFIG_X86_PAE 95#ifdef CONFIG_X86_PAE
84extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); 96extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
@@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
108 free_page((unsigned long)pud); 120 free_page((unsigned long)pud);
109} 121}
110 122
111extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); 123extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
124
125static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
126 unsigned long address)
127{
128 ___pud_free_tlb(tlb, pud);
129}
130
112#endif /* PAGETABLE_LEVELS > 3 */ 131#endif /* PAGETABLE_LEVELS > 3 */
113#endif /* PAGETABLE_LEVELS > 2 */ 132#endif /* PAGETABLE_LEVELS > 2 */
114 133
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 49fb3ecf3bb3..621f56d73121 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -22,7 +22,14 @@ extern int reboot_force;
22 22
23long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); 23long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
24 24
25#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1)) 25/*
26#define round_down(x, y) ((x) & ~((y) - 1)) 26 * This looks more complex than it should be. But we need to
27 * get the type for the ~ right in round_down (it needs to be
28 * as wide as the result!), and we want to evaluate the macro
29 * arguments just once each.
30 */
31#define __round_mask(x,y) ((__typeof__(x))((y)-1))
32#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1)
33#define round_down(x,y) ((x) & ~__round_mask(x,y))
27 34
28#endif /* _ASM_X86_PROTO_H */ 35#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index b7e5db876399..4e77853321db 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -302,4 +302,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
302#define _raw_read_relax(lock) cpu_relax() 302#define _raw_read_relax(lock) cpu_relax()
303#define _raw_write_relax(lock) cpu_relax() 303#define _raw_write_relax(lock) cpu_relax()
304 304
305/* The {read|write|spin}_lock() on x86 are full memory barriers. */
306static inline void smp_mb__after_lock(void) { }
307#define ARCH_HAS_SMP_MB_AFTER_LOCK
308
305#endif /* _ASM_X86_SPINLOCK_H */ 309#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index f517944b2b17..cf86a5e73815 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,6 +3,8 @@
3 3
4extern int kstack_depth_to_print; 4extern int kstack_depth_to_print;
5 5
6int x86_is_stack_id(int id, char *name);
7
6/* Generic stack tracer with callbacks */ 8/* Generic stack tracer with callbacks */
7 9
8struct stacktrace_ops { 10struct stacktrace_ops {
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index b0783520988b..fad7d40b75f8 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -49,7 +49,7 @@ struct thread_info {
49 .exec_domain = &default_exec_domain, \ 49 .exec_domain = &default_exec_domain, \
50 .flags = 0, \ 50 .flags = 0, \
51 .cpu = 0, \ 51 .cpu = 0, \
52 .preempt_count = 1, \ 52 .preempt_count = INIT_PREEMPT_COUNT, \
53 .addr_limit = KERNEL_DS, \ 53 .addr_limit = KERNEL_DS, \
54 .restart_block = { \ 54 .restart_block = { \
55 .fn = do_no_restart_syscall, \ 55 .fn = do_no_restart_syscall, \
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 20e6a795e160..d2c6c930b491 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -212,9 +212,9 @@ extern int __get_user_bad(void);
212 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") 212 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
213#else 213#else
214#define __put_user_asm_u64(x, ptr, retval, errret) \ 214#define __put_user_asm_u64(x, ptr, retval, errret) \
215 __put_user_asm(x, ptr, retval, "q", "", "Zr", errret) 215 __put_user_asm(x, ptr, retval, "q", "", "er", errret)
216#define __put_user_asm_ex_u64(x, addr) \ 216#define __put_user_asm_ex_u64(x, addr) \
217 __put_user_asm_ex(x, addr, "q", "", "Zr") 217 __put_user_asm_ex(x, addr, "q", "", "er")
218#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) 218#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
219#endif 219#endif
220 220
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 8cc687326eb8..db24b215fc50 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -88,11 +88,11 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
88 ret, "l", "k", "ir", 4); 88 ret, "l", "k", "ir", 4);
89 return ret; 89 return ret;
90 case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, 90 case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
91 ret, "q", "", "ir", 8); 91 ret, "q", "", "er", 8);
92 return ret; 92 return ret;
93 case 10: 93 case 10:
94 __put_user_asm(*(u64 *)src, (u64 __user *)dst, 94 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
95 ret, "q", "", "ir", 10); 95 ret, "q", "", "er", 10);
96 if (unlikely(ret)) 96 if (unlikely(ret))
97 return ret; 97 return ret;
98 asm("":::"memory"); 98 asm("":::"memory");
@@ -101,12 +101,12 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
101 return ret; 101 return ret;
102 case 16: 102 case 16:
103 __put_user_asm(*(u64 *)src, (u64 __user *)dst, 103 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
104 ret, "q", "", "ir", 16); 104 ret, "q", "", "er", 16);
105 if (unlikely(ret)) 105 if (unlikely(ret))
106 return ret; 106 return ret;
107 asm("":::"memory"); 107 asm("":::"memory");
108 __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, 108 __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
109 ret, "q", "", "ir", 8); 109 ret, "q", "", "er", 8);
110 return ret; 110 return ret;
111 default: 111 default:
112 return copy_user_generic((__force void *)dst, src, size); 112 return copy_user_generic((__force void *)dst, src, size);
@@ -157,7 +157,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
157 ret, "q", "", "=r", 8); 157 ret, "q", "", "=r", 8);
158 if (likely(!ret)) 158 if (likely(!ret))
159 __put_user_asm(tmp, (u64 __user *)dst, 159 __put_user_asm(tmp, (u64 __user *)dst,
160 ret, "q", "", "ir", 8); 160 ret, "q", "", "er", 8);
161 return ret; 161 return ret;
162 } 162 }
163 default: 163 default:
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 341070f7ad5c..77a68505419a 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -175,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) 175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
176 176
177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ 177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
178 ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) 178 (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
179 179
180#define UV_APIC_PNODE_SHIFT 6 180#define UV_APIC_PNODE_SHIFT 6
181 181
@@ -327,6 +327,7 @@ struct uv_blade_info {
327 unsigned short nr_possible_cpus; 327 unsigned short nr_possible_cpus;
328 unsigned short nr_online_cpus; 328 unsigned short nr_online_cpus;
329 unsigned short pnode; 329 unsigned short pnode;
330 short memory_nid;
330}; 331};
331extern struct uv_blade_info *uv_blade_info; 332extern struct uv_blade_info *uv_blade_info;
332extern short *uv_node_to_blade; 333extern short *uv_node_to_blade;
@@ -363,6 +364,12 @@ static inline int uv_blade_to_pnode(int bid)
363 return uv_blade_info[bid].pnode; 364 return uv_blade_info[bid].pnode;
364} 365}
365 366
367/* Nid of memory node on blade. -1 if no blade-local memory */
368static inline int uv_blade_to_memory_nid(int bid)
369{
370 return uv_blade_info[bid].memory_nid;
371}
372
366/* Determine the number of possible cpus on a blade */ 373/* Determine the number of possible cpus on a blade */
367static inline int uv_blade_nr_possible_cpus(int bid) 374static inline int uv_blade_nr_possible_cpus(int bid)
368{ 375{
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 6c327b852e23..430d5b24af7b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,6 +26,8 @@ CFLAGS_tsc.o := $(nostackp)
26CFLAGS_paravirt.o := $(nostackp) 26CFLAGS_paravirt.o := $(nostackp)
27GCOV_PROFILE_vsyscall_64.o := n 27GCOV_PROFILE_vsyscall_64.o := n
28GCOV_PROFILE_hpet.o := n 28GCOV_PROFILE_hpet.o := n
29GCOV_PROFILE_tsc.o := n
30GCOV_PROFILE_paravirt.o := n
29 31
30obj-y := process_$(BITS).o signal.o entry_$(BITS).o 32obj-y := process_$(BITS).o signal.o entry_$(BITS).o
31obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 33obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 9372f0406ad4..6c99f5037801 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1192,7 +1192,7 @@ out:
1192 return 0; 1192 return 0;
1193} 1193}
1194 1194
1195struct notifier_block device_nb = { 1195static struct notifier_block device_nb = {
1196 .notifier_call = device_change_notifier, 1196 .notifier_call = device_change_notifier,
1197}; 1197};
1198 1198
@@ -1763,7 +1763,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
1763 flag |= __GFP_ZERO; 1763 flag |= __GFP_ZERO;
1764 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 1764 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1765 if (!virt_addr) 1765 if (!virt_addr)
1766 return 0; 1766 return NULL;
1767 1767
1768 paddr = virt_to_phys(virt_addr); 1768 paddr = virt_to_phys(virt_addr);
1769 1769
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 10b2accd12ea..c1b17e97252e 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -472,6 +472,8 @@ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
472 if (iommu->evt_buf == NULL) 472 if (iommu->evt_buf == NULL)
473 return NULL; 473 return NULL;
474 474
475 iommu->evt_buf_size = EVT_BUFFER_SIZE;
476
475 return iommu->evt_buf; 477 return iommu->evt_buf;
476} 478}
477 479
@@ -691,6 +693,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
691 693
692 devid = e->devid; 694 devid = e->devid;
693 devid_to = e->ext >> 8; 695 devid_to = e->ext >> 8;
696 set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
694 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); 697 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
695 amd_iommu_alias_table[devid] = devid_to; 698 amd_iommu_alias_table[devid] = devid_to;
696 break; 699 break;
@@ -749,11 +752,13 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
749 752
750 devid = e->devid; 753 devid = e->devid;
751 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 754 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
752 if (alias) 755 if (alias) {
753 amd_iommu_alias_table[dev_i] = devid_to; 756 amd_iommu_alias_table[dev_i] = devid_to;
754 set_dev_entry_from_acpi(iommu, 757 set_dev_entry_from_acpi(iommu,
755 amd_iommu_alias_table[dev_i], 758 devid_to, flags, ext_flags);
756 flags, ext_flags); 759 }
760 set_dev_entry_from_acpi(iommu, dev_i,
761 flags, ext_flags);
757 } 762 }
758 break; 763 break;
759 default: 764 default:
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8c7c042ecad1..0a1c2830ec66 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -140,7 +140,6 @@ int x2apic_mode;
140#ifdef CONFIG_X86_X2APIC 140#ifdef CONFIG_X86_X2APIC
141/* x2apic enabled before OS handover */ 141/* x2apic enabled before OS handover */
142static int x2apic_preenabled; 142static int x2apic_preenabled;
143static int disable_x2apic;
144static __init int setup_nox2apic(char *str) 143static __init int setup_nox2apic(char *str)
145{ 144{
146 if (x2apic_enabled()) { 145 if (x2apic_enabled()) {
@@ -149,7 +148,6 @@ static __init int setup_nox2apic(char *str)
149 return 0; 148 return 0;
150 } 149 }
151 150
152 disable_x2apic = 1;
153 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 151 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
154 return 0; 152 return 0;
155} 153}
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 69328ac8de9c..8952a5890281 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -652,7 +652,8 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
652 return ret && es7000_apic_is_cluster(); 652 return ret && es7000_apic_is_cluster();
653} 653}
654 654
655struct apic apic_es7000_cluster = { 655/* We've been warned by a false positive warning.Use __refdata to keep calm. */
656struct apic __refdata apic_es7000_cluster = {
656 657
657 .name = "es7000", 658 .name = "es7000",
658 .probe = probe_es7000, 659 .probe = probe_es7000,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4d0216fcb36c..d2ed6c5ddc80 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1716,25 +1716,19 @@ __apicdebuginit(void) print_IO_APIC(void)
1716 return; 1716 return;
1717} 1717}
1718 1718
1719__apicdebuginit(void) print_APIC_bitfield(int base) 1719__apicdebuginit(void) print_APIC_field(int base)
1720{ 1720{
1721 unsigned int v; 1721 int i;
1722 int i, j;
1723 1722
1724 if (apic_verbosity == APIC_QUIET) 1723 if (apic_verbosity == APIC_QUIET)
1725 return; 1724 return;
1726 1725
1727 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); 1726 printk(KERN_DEBUG);
1728 for (i = 0; i < 8; i++) { 1727
1729 v = apic_read(base + i*0x10); 1728 for (i = 0; i < 8; i++)
1730 for (j = 0; j < 32; j++) { 1729 printk(KERN_CONT "%08x", apic_read(base + i*0x10));
1731 if (v & (1<<j)) 1730
1732 printk("1"); 1731 printk(KERN_CONT "\n");
1733 else
1734 printk("0");
1735 }
1736 printk("\n");
1737 }
1738} 1732}
1739 1733
1740__apicdebuginit(void) print_local_APIC(void *dummy) 1734__apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1745,7 +1739,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1745 if (apic_verbosity == APIC_QUIET) 1739 if (apic_verbosity == APIC_QUIET)
1746 return; 1740 return;
1747 1741
1748 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1742 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1749 smp_processor_id(), hard_smp_processor_id()); 1743 smp_processor_id(), hard_smp_processor_id());
1750 v = apic_read(APIC_ID); 1744 v = apic_read(APIC_ID);
1751 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); 1745 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
@@ -1786,11 +1780,11 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1786 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); 1780 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1787 1781
1788 printk(KERN_DEBUG "... APIC ISR field:\n"); 1782 printk(KERN_DEBUG "... APIC ISR field:\n");
1789 print_APIC_bitfield(APIC_ISR); 1783 print_APIC_field(APIC_ISR);
1790 printk(KERN_DEBUG "... APIC TMR field:\n"); 1784 printk(KERN_DEBUG "... APIC TMR field:\n");
1791 print_APIC_bitfield(APIC_TMR); 1785 print_APIC_field(APIC_TMR);
1792 printk(KERN_DEBUG "... APIC IRR field:\n"); 1786 printk(KERN_DEBUG "... APIC IRR field:\n");
1793 print_APIC_bitfield(APIC_IRR); 1787 print_APIC_field(APIC_IRR);
1794 1788
1795 if (APIC_INTEGRATED(ver)) { /* !82489DX */ 1789 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1796 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1790 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -3799,6 +3793,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3799 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3793 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3800 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3794 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3801 3795
3796 if (cfg->move_in_progress)
3797 send_cleanup_vector(cfg);
3798
3802 return irq; 3799 return irq;
3803} 3800}
3804 3801
@@ -4187,28 +4184,20 @@ fake_ioapic_page:
4187 } 4184 }
4188} 4185}
4189 4186
4190static int __init ioapic_insert_resources(void) 4187void __init ioapic_insert_resources(void)
4191{ 4188{
4192 int i; 4189 int i;
4193 struct resource *r = ioapic_resources; 4190 struct resource *r = ioapic_resources;
4194 4191
4195 if (!r) { 4192 if (!r) {
4196 if (nr_ioapics > 0) { 4193 if (nr_ioapics > 0)
4197 printk(KERN_ERR 4194 printk(KERN_ERR
4198 "IO APIC resources couldn't be allocated.\n"); 4195 "IO APIC resources couldn't be allocated.\n");
4199 return -1; 4196 return;
4200 }
4201 return 0;
4202 } 4197 }
4203 4198
4204 for (i = 0; i < nr_ioapics; i++) { 4199 for (i = 0; i < nr_ioapics; i++) {
4205 insert_resource(&iomem_resource, r); 4200 insert_resource(&iomem_resource, r);
4206 r++; 4201 r++;
4207 } 4202 }
4208
4209 return 0;
4210} 4203}
4211
4212/* Insert the IO APIC resources after PCI initialization has occured to handle
4213 * IO APICS that are mapped in on a BAR in PCI space. */
4214late_initcall(ioapic_insert_resources);
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 533e59c6fc82..ca96e68f0d23 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -493,7 +493,8 @@ static void numaq_setup_portio_remap(void)
493 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); 493 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
494} 494}
495 495
496struct apic apic_numaq = { 496/* Use __refdata to keep false positive warning calm. */
497struct apic __refdata apic_numaq = {
497 498
498 .name = "NUMAQ", 499 .name = "NUMAQ",
499 .probe = probe_numaq, 500 .probe = probe_numaq,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8e4cbb255c38..2ed4e2bb3b32 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -170,7 +170,7 @@ static unsigned long set_apic_id(unsigned int id)
170 170
171static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) 171static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
172{ 172{
173 return current_cpu_data.initial_apicid >> index_msb; 173 return initial_apicid >> index_msb;
174} 174}
175 175
176static void x2apic_send_IPI_self(int vector) 176static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a284359627e7..0b631c6a2e00 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -162,7 +162,7 @@ static unsigned long set_apic_id(unsigned int id)
162 162
163static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) 163static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
164{ 164{
165 return current_cpu_data.initial_apicid >> index_msb; 165 return initial_apicid >> index_msb;
166} 166}
167 167
168static void x2apic_send_IPI_self(int vector) 168static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 096d19aea2f7..832e908adcb5 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = {
261 .apic_id_registered = uv_apic_id_registered, 261 .apic_id_registered = uv_apic_id_registered,
262 262
263 .irq_delivery_mode = dest_Fixed, 263 .irq_delivery_mode = dest_Fixed,
264 .irq_dest_mode = 1, /* logical */ 264 .irq_dest_mode = 0, /* physical */
265 265
266 .target_cpus = uv_target_cpus, 266 .target_cpus = uv_target_cpus,
267 .disable_esr = 0, 267 .disable_esr = 0,
@@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
362 BUG(); 362 BUG();
363} 363}
364 364
365static __init void map_low_mmrs(void)
366{
367 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
368 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
369}
370
371enum map_type {map_wb, map_uc}; 365enum map_type {map_wb, map_uc};
372 366
373static __init void map_high(char *id, unsigned long base, int shift, 367static __init void map_high(char *id, unsigned long base, int shift,
@@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode)
395 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
396} 390}
397 391
398static __init void map_config_high(int max_pnode)
399{
400 union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
401 int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
402
403 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
404 if (cfg.s.enable)
405 map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
406}
407
408static __init void map_mmr_high(int max_pnode)
409{
410 union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
411 int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
412
413 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
414 if (mmr.s.enable)
415 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
416}
417
418static __init void map_mmioh_high(int max_pnode) 392static __init void map_mmioh_high(int max_pnode)
419{ 393{
420 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 394 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -566,8 +540,6 @@ void __init uv_system_init(void)
566 unsigned long mmr_base, present, paddr; 540 unsigned long mmr_base, present, paddr;
567 unsigned short pnode_mask; 541 unsigned short pnode_mask;
568 542
569 map_low_mmrs();
570
571 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 543 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
572 m_val = m_n_config.s.m_skt; 544 m_val = m_n_config.s.m_skt;
573 n_val = m_n_config.s.n_skt; 545 n_val = m_n_config.s.n_skt;
@@ -591,6 +563,8 @@ void __init uv_system_init(void)
591 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 563 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
592 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 564 uv_blade_info = kmalloc(bytes, GFP_KERNEL);
593 BUG_ON(!uv_blade_info); 565 BUG_ON(!uv_blade_info);
566 for (blade = 0; blade < uv_num_possible_blades(); blade++)
567 uv_blade_info[blade].memory_nid = -1;
594 568
595 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); 569 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
596 570
@@ -629,6 +603,9 @@ void __init uv_system_init(void)
629 lcpu = uv_blade_info[blade].nr_possible_cpus; 603 lcpu = uv_blade_info[blade].nr_possible_cpus;
630 uv_blade_info[blade].nr_possible_cpus++; 604 uv_blade_info[blade].nr_possible_cpus++;
631 605
606 /* Any node on the blade, else will contain -1. */
607 uv_blade_info[blade].memory_nid = nid;
608
632 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 609 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
633 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; 610 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
634 uv_cpu_hub_info(cpu)->m_val = m_val; 611 uv_cpu_hub_info(cpu)->m_val = m_val;
@@ -662,11 +639,10 @@ void __init uv_system_init(void)
662 pnode = (paddr >> m_val) & pnode_mask; 639 pnode = (paddr >> m_val) & pnode_mask;
663 blade = boot_pnode_to_blade(pnode); 640 blade = boot_pnode_to_blade(pnode);
664 uv_node_to_blade[nid] = blade; 641 uv_node_to_blade[nid] = blade;
642 max_pnode = max(pnode, max_pnode);
665 } 643 }
666 644
667 map_gru_high(max_pnode); 645 map_gru_high(max_pnode);
668 map_mmr_high(max_pnode);
669 map_config_high(max_pnode);
670 map_mmioh_high(max_pnode); 646 map_mmioh_high(max_pnode);
671 647
672 uv_cpu_init(); 648 uv_cpu_init();
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 79302e9a33a4..442b5508893f 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -811,7 +811,7 @@ static int apm_do_idle(void)
811 u8 ret = 0; 811 u8 ret = 0;
812 int idled = 0; 812 int idled = 0;
813 int polling; 813 int polling;
814 int err; 814 int err = 0;
815 815
816 polling = !!(current_thread_info()->status & TS_POLLING); 816 polling = !!(current_thread_info()->status & TS_POLLING);
817 if (polling) { 817 if (polling) {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e5b27d8f1b47..e2485b03f1cf 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -258,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
258{ 258{
259#ifdef CONFIG_X86_HT 259#ifdef CONFIG_X86_HT
260 unsigned bits; 260 unsigned bits;
261 int cpu = smp_processor_id();
261 262
262 bits = c->x86_coreid_bits; 263 bits = c->x86_coreid_bits;
263
264 /* Low order bits define the core id (index of core in socket) */ 264 /* Low order bits define the core id (index of core in socket) */
265 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); 265 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
266 /* Convert the initial APIC ID into the socket ID */ 266 /* Convert the initial APIC ID into the socket ID */
267 c->phys_proc_id = c->initial_apicid >> bits; 267 c->phys_proc_id = c->initial_apicid >> bits;
268 /* use socket ID also for last level cache */
269 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
268#endif 270#endif
269} 271}
270 272
@@ -354,7 +356,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
354#endif 356#endif
355#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) 357#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
356 /* check CPU config space for extended APIC ID */ 358 /* check CPU config space for extended APIC ID */
357 if (c->x86 >= 0xf) { 359 if (cpu_has_apic && c->x86 >= 0xf) {
358 unsigned int val; 360 unsigned int val;
359 val = read_pci_config(0, 24, 0, 0x68); 361 val = read_pci_config(0, 24, 0, 0x68);
360 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) 362 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6b26d4deada0..f1961c07af9a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -848,9 +848,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
848#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 848#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
849 numa_add_cpu(smp_processor_id()); 849 numa_add_cpu(smp_processor_id());
850#endif 850#endif
851
852 /* Cap the iomem address space to what is addressable on all CPUs */
853 iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1;
854} 851}
855 852
856#ifdef CONFIG_X86_64 853#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 81cbe64ed6b4..2a50ef891000 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -299,7 +299,7 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
299static int transition_fid_vid(struct powernow_k8_data *data, 299static int transition_fid_vid(struct powernow_k8_data *data,
300 u32 reqfid, u32 reqvid) 300 u32 reqfid, u32 reqvid)
301{ 301{
302 if (core_voltage_pre_transition(data, reqvid)) 302 if (core_voltage_pre_transition(data, reqvid, reqfid))
303 return 1; 303 return 1;
304 304
305 if (core_frequency_transition(data, reqfid)) 305 if (core_frequency_transition(data, reqfid))
@@ -327,17 +327,20 @@ static int transition_fid_vid(struct powernow_k8_data *data,
327 327
328/* Phase 1 - core voltage transition ... setup voltage */ 328/* Phase 1 - core voltage transition ... setup voltage */
329static int core_voltage_pre_transition(struct powernow_k8_data *data, 329static int core_voltage_pre_transition(struct powernow_k8_data *data,
330 u32 reqvid) 330 u32 reqvid, u32 reqfid)
331{ 331{
332 u32 rvosteps = data->rvo; 332 u32 rvosteps = data->rvo;
333 u32 savefid = data->currfid; 333 u32 savefid = data->currfid;
334 u32 maxvid, lo; 334 u32 maxvid, lo, rvomult = 1;
335 335
336 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " 336 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
337 "reqvid 0x%x, rvo 0x%x\n", 337 "reqvid 0x%x, rvo 0x%x\n",
338 smp_processor_id(), 338 smp_processor_id(),
339 data->currfid, data->currvid, reqvid, data->rvo); 339 data->currfid, data->currvid, reqvid, data->rvo);
340 340
341 if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
342 rvomult = 2;
343 rvosteps *= rvomult;
341 rdmsr(MSR_FIDVID_STATUS, lo, maxvid); 344 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
342 maxvid = 0x1f & (maxvid >> 16); 345 maxvid = 0x1f & (maxvid >> 16);
343 dprintk("ph1 maxvid=0x%x\n", maxvid); 346 dprintk("ph1 maxvid=0x%x\n", maxvid);
@@ -351,7 +354,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data,
351 return 1; 354 return 1;
352 } 355 }
353 356
354 while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { 357 while ((rvosteps > 0) &&
358 ((rvomult * data->rvo + data->currvid) > reqvid)) {
355 if (data->currvid == maxvid) { 359 if (data->currvid == maxvid) {
356 rvosteps = 0; 360 rvosteps = 0;
357 } else { 361 } else {
@@ -384,13 +388,6 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
384 u32 vcoreqfid, vcocurrfid, vcofiddiff; 388 u32 vcoreqfid, vcocurrfid, vcofiddiff;
385 u32 fid_interval, savevid = data->currvid; 389 u32 fid_interval, savevid = data->currvid;
386 390
387 if ((reqfid < HI_FID_TABLE_BOTTOM) &&
388 (data->currfid < HI_FID_TABLE_BOTTOM)) {
389 printk(KERN_ERR PFX "ph2: illegal lo-lo transition "
390 "0x%x 0x%x\n", reqfid, data->currfid);
391 return 1;
392 }
393
394 if (data->currfid == reqfid) { 391 if (data->currfid == reqfid) {
395 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", 392 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
396 data->currfid); 393 data->currfid);
@@ -407,6 +404,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
407 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid 404 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
408 : vcoreqfid - vcocurrfid; 405 : vcoreqfid - vcocurrfid;
409 406
407 if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
408 vcofiddiff = 0;
409
410 while (vcofiddiff > 2) { 410 while (vcofiddiff > 2) {
411 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); 411 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
412 412
@@ -1081,14 +1081,6 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
1081 return 0; 1081 return 0;
1082 } 1082 }
1083 1083
1084 if ((fid < HI_FID_TABLE_BOTTOM) &&
1085 (data->currfid < HI_FID_TABLE_BOTTOM)) {
1086 printk(KERN_ERR PFX
1087 "ignoring illegal change in lo freq table-%x to 0x%x\n",
1088 data->currfid, fid);
1089 return 1;
1090 }
1091
1092 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", 1084 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
1093 smp_processor_id(), fid, vid); 1085 smp_processor_id(), fid, vid);
1094 freqs.old = find_khz_freq_from_fid(data->currfid); 1086 freqs.old = find_khz_freq_from_fid(data->currfid);
@@ -1267,7 +1259,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1267{ 1259{
1268 static const char ACPI_PSS_BIOS_BUG_MSG[] = 1260 static const char ACPI_PSS_BIOS_BUG_MSG[] =
1269 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" 1261 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1270 KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; 1262 FW_BUG PFX "Try again with latest BIOS.\n";
1271 struct powernow_k8_data *data; 1263 struct powernow_k8_data *data;
1272 struct init_on_cpu init_on_cpu; 1264 struct init_on_cpu init_on_cpu;
1273 int rc; 1265 int rc;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index c9c1190b5e1f..02ce824073cb 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -215,7 +215,8 @@ struct pst_s {
215 215
216#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) 216#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
217 217
218static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); 218static int core_voltage_pre_transition(struct powernow_k8_data *data,
219 u32 reqvid, u32 regfid);
219static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); 220static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
220static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); 221static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
221 222
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 284d1de968bc..1cfb623ce11c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -194,14 +194,14 @@ static void print_mce(struct mce *m)
194 m->cs, m->ip); 194 m->cs, m->ip);
195 if (m->cs == __KERNEL_CS) 195 if (m->cs == __KERNEL_CS)
196 print_symbol("{%s}", m->ip); 196 print_symbol("{%s}", m->ip);
197 printk("\n"); 197 printk(KERN_CONT "\n");
198 } 198 }
199 printk(KERN_EMERG "TSC %llx ", m->tsc); 199 printk(KERN_EMERG "TSC %llx ", m->tsc);
200 if (m->addr) 200 if (m->addr)
201 printk("ADDR %llx ", m->addr); 201 printk(KERN_CONT "ADDR %llx ", m->addr);
202 if (m->misc) 202 if (m->misc)
203 printk("MISC %llx ", m->misc); 203 printk(KERN_CONT "MISC %llx ", m->misc);
204 printk("\n"); 204 printk(KERN_CONT "\n");
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid, 206 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid); 207 m->apicid);
@@ -209,13 +209,13 @@ static void print_mce(struct mce *m)
209 209
210static void print_mce_head(void) 210static void print_mce_head(void)
211{ 211{
212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n"); 212 printk(KERN_EMERG "\nHARDWARE ERROR\n");
213} 213}
214 214
215static void print_mce_tail(void) 215static void print_mce_tail(void)
216{ 216{
217 printk(KERN_EMERG "This is not a software problem!\n" 217 printk(KERN_EMERG "This is not a software problem!\n"
218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n"); 218 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
219} 219}
220 220
221#define PANIC_TIMEOUT 5 /* 5 seconds */ 221#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -1117,7 +1117,7 @@ static void mcheck_timer(unsigned long data)
1117 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); 1117 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1118 1118
1119 t->expires = jiffies + *n; 1119 t->expires = jiffies + *n;
1120 add_timer(t); 1120 add_timer_on(t, smp_processor_id());
1121} 1121}
1122 1122
1123static void mce_do_trigger(struct work_struct *work) 1123static void mce_do_trigger(struct work_struct *work)
@@ -1321,7 +1321,7 @@ static void mce_init_timer(void)
1321 return; 1321 return;
1322 setup_timer(t, mcheck_timer, smp_processor_id()); 1322 setup_timer(t, mcheck_timer, smp_processor_id());
1323 t->expires = round_jiffies(jiffies + *n); 1323 t->expires = round_jiffies(jiffies + *n);
1324 add_timer(t); 1324 add_timer_on(t, smp_processor_id());
1325} 1325}
1326 1326
1327/* 1327/*
@@ -1692,17 +1692,15 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1692 const char *buf, size_t siz) 1692 const char *buf, size_t siz)
1693{ 1693{
1694 char *p; 1694 char *p;
1695 int len;
1696 1695
1697 strncpy(mce_helper, buf, sizeof(mce_helper)); 1696 strncpy(mce_helper, buf, sizeof(mce_helper));
1698 mce_helper[sizeof(mce_helper)-1] = 0; 1697 mce_helper[sizeof(mce_helper)-1] = 0;
1699 len = strlen(mce_helper);
1700 p = strchr(mce_helper, '\n'); 1698 p = strchr(mce_helper, '\n');
1701 1699
1702 if (*p) 1700 if (p)
1703 *p = 0; 1701 *p = 0;
1704 1702
1705 return len; 1703 return strlen(mce_helper) + !!p;
1706} 1704}
1707 1705
1708static ssize_t set_ignore_ce(struct sys_device *s, 1706static ssize_t set_ignore_ce(struct sys_device *s,
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 76dfef23f789..a7aa8f900954 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -66,6 +66,52 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
66}; 66};
67 67
68/* 68/*
69 * Not sure about some of these
70 */
71static const u64 p6_perfmon_event_map[] =
72{
73 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
74 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
75 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0000,
76 [PERF_COUNT_HW_CACHE_MISSES] = 0x0000,
77 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
78 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
79 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
80};
81
82static u64 p6_pmu_event_map(int event)
83{
84 return p6_perfmon_event_map[event];
85}
86
87/*
88 * Counter setting that is specified not to count anything.
89 * We use this to effectively disable a counter.
90 *
91 * L2_RQSTS with 0 MESI unit mask.
92 */
93#define P6_NOP_COUNTER 0x0000002EULL
94
95static u64 p6_pmu_raw_event(u64 event)
96{
97#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
98#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
99#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
100#define P6_EVNTSEL_INV_MASK 0x00800000ULL
101#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL
102
103#define P6_EVNTSEL_MASK \
104 (P6_EVNTSEL_EVENT_MASK | \
105 P6_EVNTSEL_UNIT_MASK | \
106 P6_EVNTSEL_EDGE_MASK | \
107 P6_EVNTSEL_INV_MASK | \
108 P6_EVNTSEL_COUNTER_MASK)
109
110 return event & P6_EVNTSEL_MASK;
111}
112
113
114/*
69 * Intel PerfMon v3. Used on Core2 and later. 115 * Intel PerfMon v3. Used on Core2 and later.
70 */ 116 */
71static const u64 intel_perfmon_event_map[] = 117static const u64 intel_perfmon_event_map[] =
@@ -401,7 +447,7 @@ static const u64 amd_hw_cache_event_ids
401 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ 447 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
402 }, 448 },
403 [ C(OP_WRITE) ] = { 449 [ C(OP_WRITE) ] = {
404 [ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */ 450 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
405 [ C(RESULT_MISS) ] = 0, 451 [ C(RESULT_MISS) ] = 0,
406 }, 452 },
407 [ C(OP_PREFETCH) ] = { 453 [ C(OP_PREFETCH) ] = {
@@ -666,6 +712,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
666{ 712{
667 struct perf_counter_attr *attr = &counter->attr; 713 struct perf_counter_attr *attr = &counter->attr;
668 struct hw_perf_counter *hwc = &counter->hw; 714 struct hw_perf_counter *hwc = &counter->hw;
715 u64 config;
669 int err; 716 int err;
670 717
671 if (!x86_pmu_initialized()) 718 if (!x86_pmu_initialized())
@@ -718,14 +765,40 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
718 765
719 if (attr->config >= x86_pmu.max_events) 766 if (attr->config >= x86_pmu.max_events)
720 return -EINVAL; 767 return -EINVAL;
768
721 /* 769 /*
722 * The generic map: 770 * The generic map:
723 */ 771 */
724 hwc->config |= x86_pmu.event_map(attr->config); 772 config = x86_pmu.event_map(attr->config);
773
774 if (config == 0)
775 return -ENOENT;
776
777 if (config == -1LL)
778 return -EINVAL;
779
780 hwc->config |= config;
725 781
726 return 0; 782 return 0;
727} 783}
728 784
785static void p6_pmu_disable_all(void)
786{
787 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
788 u64 val;
789
790 if (!cpuc->enabled)
791 return;
792
793 cpuc->enabled = 0;
794 barrier();
795
796 /* p6 only has one enable register */
797 rdmsrl(MSR_P6_EVNTSEL0, val);
798 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
799 wrmsrl(MSR_P6_EVNTSEL0, val);
800}
801
729static void intel_pmu_disable_all(void) 802static void intel_pmu_disable_all(void)
730{ 803{
731 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 804 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@ -767,6 +840,23 @@ void hw_perf_disable(void)
767 return x86_pmu.disable_all(); 840 return x86_pmu.disable_all();
768} 841}
769 842
843static void p6_pmu_enable_all(void)
844{
845 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
846 unsigned long val;
847
848 if (cpuc->enabled)
849 return;
850
851 cpuc->enabled = 1;
852 barrier();
853
854 /* p6 only has one enable register */
855 rdmsrl(MSR_P6_EVNTSEL0, val);
856 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
857 wrmsrl(MSR_P6_EVNTSEL0, val);
858}
859
770static void intel_pmu_enable_all(void) 860static void intel_pmu_enable_all(void)
771{ 861{
772 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 862 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
@@ -784,13 +874,13 @@ static void amd_pmu_enable_all(void)
784 barrier(); 874 barrier();
785 875
786 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 876 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
877 struct perf_counter *counter = cpuc->counters[idx];
787 u64 val; 878 u64 val;
788 879
789 if (!test_bit(idx, cpuc->active_mask)) 880 if (!test_bit(idx, cpuc->active_mask))
790 continue; 881 continue;
791 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 882
792 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) 883 val = counter->hw.config;
793 continue;
794 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 884 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
795 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 885 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
796 } 886 }
@@ -819,16 +909,13 @@ static inline void intel_pmu_ack_status(u64 ack)
819 909
820static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 910static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
821{ 911{
822 int err; 912 (void)checking_wrmsrl(hwc->config_base + idx,
823 err = checking_wrmsrl(hwc->config_base + idx,
824 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); 913 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
825} 914}
826 915
827static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 916static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
828{ 917{
829 int err; 918 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
830 err = checking_wrmsrl(hwc->config_base + idx,
831 hwc->config);
832} 919}
833 920
834static inline void 921static inline void
@@ -836,13 +923,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
836{ 923{
837 int idx = __idx - X86_PMC_IDX_FIXED; 924 int idx = __idx - X86_PMC_IDX_FIXED;
838 u64 ctrl_val, mask; 925 u64 ctrl_val, mask;
839 int err;
840 926
841 mask = 0xfULL << (idx * 4); 927 mask = 0xfULL << (idx * 4);
842 928
843 rdmsrl(hwc->config_base, ctrl_val); 929 rdmsrl(hwc->config_base, ctrl_val);
844 ctrl_val &= ~mask; 930 ctrl_val &= ~mask;
845 err = checking_wrmsrl(hwc->config_base, ctrl_val); 931 (void)checking_wrmsrl(hwc->config_base, ctrl_val);
932}
933
934static inline void
935p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
936{
937 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
938 u64 val = P6_NOP_COUNTER;
939
940 if (cpuc->enabled)
941 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
942
943 (void)checking_wrmsrl(hwc->config_base + idx, val);
846} 944}
847 945
848static inline void 946static inline void
@@ -912,6 +1010,8 @@ x86_perf_counter_set_period(struct perf_counter *counter,
912 err = checking_wrmsrl(hwc->counter_base + idx, 1010 err = checking_wrmsrl(hwc->counter_base + idx,
913 (u64)(-left) & x86_pmu.counter_mask); 1011 (u64)(-left) & x86_pmu.counter_mask);
914 1012
1013 perf_counter_update_userpage(counter);
1014
915 return ret; 1015 return ret;
916} 1016}
917 1017
@@ -941,6 +1041,19 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
941 err = checking_wrmsrl(hwc->config_base, ctrl_val); 1041 err = checking_wrmsrl(hwc->config_base, ctrl_val);
942} 1042}
943 1043
1044static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1045{
1046 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1047 u64 val;
1048
1049 val = hwc->config;
1050 if (cpuc->enabled)
1051 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1052
1053 (void)checking_wrmsrl(hwc->config_base + idx, val);
1054}
1055
1056
944static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1057static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
945{ 1058{
946 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 1059 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -957,8 +1070,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
957 1070
958 if (cpuc->enabled) 1071 if (cpuc->enabled)
959 x86_pmu_enable_counter(hwc, idx); 1072 x86_pmu_enable_counter(hwc, idx);
960 else
961 x86_pmu_disable_counter(hwc, idx);
962} 1073}
963 1074
964static int 1075static int
@@ -969,13 +1080,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
969 if (!x86_pmu.num_counters_fixed) 1080 if (!x86_pmu.num_counters_fixed)
970 return -1; 1081 return -1;
971 1082
972 /*
973 * Quirk, IA32_FIXED_CTRs do not work on current Atom processors:
974 */
975 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
976 boot_cpu_data.x86_model == 28)
977 return -1;
978
979 event = hwc->config & ARCH_PERFMON_EVENT_MASK; 1083 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
980 1084
981 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1085 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
@@ -1041,6 +1145,8 @@ try_generic:
1041 x86_perf_counter_set_period(counter, hwc, idx); 1145 x86_perf_counter_set_period(counter, hwc, idx);
1042 x86_pmu.enable(hwc, idx); 1146 x86_pmu.enable(hwc, idx);
1043 1147
1148 perf_counter_update_userpage(counter);
1149
1044 return 0; 1150 return 0;
1045} 1151}
1046 1152
@@ -1133,6 +1239,8 @@ static void x86_pmu_disable(struct perf_counter *counter)
1133 x86_perf_counter_update(counter, hwc, idx); 1239 x86_perf_counter_update(counter, hwc, idx);
1134 cpuc->counters[idx] = NULL; 1240 cpuc->counters[idx] = NULL;
1135 clear_bit(idx, cpuc->used_mask); 1241 clear_bit(idx, cpuc->used_mask);
1242
1243 perf_counter_update_userpage(counter);
1136} 1244}
1137 1245
1138/* 1246/*
@@ -1177,6 +1285,49 @@ static void intel_pmu_reset(void)
1177 local_irq_restore(flags); 1285 local_irq_restore(flags);
1178} 1286}
1179 1287
1288static int p6_pmu_handle_irq(struct pt_regs *regs)
1289{
1290 struct perf_sample_data data;
1291 struct cpu_hw_counters *cpuc;
1292 struct perf_counter *counter;
1293 struct hw_perf_counter *hwc;
1294 int idx, handled = 0;
1295 u64 val;
1296
1297 data.regs = regs;
1298 data.addr = 0;
1299
1300 cpuc = &__get_cpu_var(cpu_hw_counters);
1301
1302 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1303 if (!test_bit(idx, cpuc->active_mask))
1304 continue;
1305
1306 counter = cpuc->counters[idx];
1307 hwc = &counter->hw;
1308
1309 val = x86_perf_counter_update(counter, hwc, idx);
1310 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
1311 continue;
1312
1313 /*
1314 * counter overflow
1315 */
1316 handled = 1;
1317 data.period = counter->hw.last_period;
1318
1319 if (!x86_perf_counter_set_period(counter, hwc, idx))
1320 continue;
1321
1322 if (perf_counter_overflow(counter, 1, &data))
1323 p6_pmu_disable_counter(hwc, idx);
1324 }
1325
1326 if (handled)
1327 inc_irq_stat(apic_perf_irqs);
1328
1329 return handled;
1330}
1180 1331
1181/* 1332/*
1182 * This handler is triggered by the local APIC, so the APIC IRQ handling 1333 * This handler is triggered by the local APIC, so the APIC IRQ handling
@@ -1186,14 +1337,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1186{ 1337{
1187 struct perf_sample_data data; 1338 struct perf_sample_data data;
1188 struct cpu_hw_counters *cpuc; 1339 struct cpu_hw_counters *cpuc;
1189 int bit, cpu, loops; 1340 int bit, loops;
1190 u64 ack, status; 1341 u64 ack, status;
1191 1342
1192 data.regs = regs; 1343 data.regs = regs;
1193 data.addr = 0; 1344 data.addr = 0;
1194 1345
1195 cpu = smp_processor_id(); 1346 cpuc = &__get_cpu_var(cpu_hw_counters);
1196 cpuc = &per_cpu(cpu_hw_counters, cpu);
1197 1347
1198 perf_disable(); 1348 perf_disable();
1199 status = intel_pmu_get_status(); 1349 status = intel_pmu_get_status();
@@ -1250,14 +1400,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1250 struct cpu_hw_counters *cpuc; 1400 struct cpu_hw_counters *cpuc;
1251 struct perf_counter *counter; 1401 struct perf_counter *counter;
1252 struct hw_perf_counter *hwc; 1402 struct hw_perf_counter *hwc;
1253 int cpu, idx, handled = 0; 1403 int idx, handled = 0;
1254 u64 val; 1404 u64 val;
1255 1405
1256 data.regs = regs; 1406 data.regs = regs;
1257 data.addr = 0; 1407 data.addr = 0;
1258 1408
1259 cpu = smp_processor_id(); 1409 cpuc = &__get_cpu_var(cpu_hw_counters);
1260 cpuc = &per_cpu(cpu_hw_counters, cpu);
1261 1410
1262 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1411 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1263 if (!test_bit(idx, cpuc->active_mask)) 1412 if (!test_bit(idx, cpuc->active_mask))
@@ -1354,6 +1503,32 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
1354 .priority = 1 1503 .priority = 1
1355}; 1504};
1356 1505
1506static struct x86_pmu p6_pmu = {
1507 .name = "p6",
1508 .handle_irq = p6_pmu_handle_irq,
1509 .disable_all = p6_pmu_disable_all,
1510 .enable_all = p6_pmu_enable_all,
1511 .enable = p6_pmu_enable_counter,
1512 .disable = p6_pmu_disable_counter,
1513 .eventsel = MSR_P6_EVNTSEL0,
1514 .perfctr = MSR_P6_PERFCTR0,
1515 .event_map = p6_pmu_event_map,
1516 .raw_event = p6_pmu_raw_event,
1517 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1518 .max_period = (1ULL << 31) - 1,
1519 .version = 0,
1520 .num_counters = 2,
1521 /*
1522 * Counters have 40 bits implemented. However they are designed such
1523 * that bits [32-39] are sign extensions of bit 31. As such the
1524 * effective width of a counter for P6-like PMU is 32 bits only.
1525 *
1526 * See IA-32 Intel Architecture Software developer manual Vol 3B
1527 */
1528 .counter_bits = 32,
1529 .counter_mask = (1ULL << 32) - 1,
1530};
1531
1357static struct x86_pmu intel_pmu = { 1532static struct x86_pmu intel_pmu = {
1358 .name = "Intel", 1533 .name = "Intel",
1359 .handle_irq = intel_pmu_handle_irq, 1534 .handle_irq = intel_pmu_handle_irq,
@@ -1393,6 +1568,37 @@ static struct x86_pmu amd_pmu = {
1393 .max_period = (1ULL << 47) - 1, 1568 .max_period = (1ULL << 47) - 1,
1394}; 1569};
1395 1570
1571static int p6_pmu_init(void)
1572{
1573 switch (boot_cpu_data.x86_model) {
1574 case 1:
1575 case 3: /* Pentium Pro */
1576 case 5:
1577 case 6: /* Pentium II */
1578 case 7:
1579 case 8:
1580 case 11: /* Pentium III */
1581 break;
1582 case 9:
1583 case 13:
1584 /* Pentium M */
1585 break;
1586 default:
1587 pr_cont("unsupported p6 CPU model %d ",
1588 boot_cpu_data.x86_model);
1589 return -ENODEV;
1590 }
1591
1592 if (!cpu_has_apic) {
1593 pr_info("no Local APIC, try rebooting with lapic");
1594 return -ENODEV;
1595 }
1596
1597 x86_pmu = p6_pmu;
1598
1599 return 0;
1600}
1601
1396static int intel_pmu_init(void) 1602static int intel_pmu_init(void)
1397{ 1603{
1398 union cpuid10_edx edx; 1604 union cpuid10_edx edx;
@@ -1401,8 +1607,14 @@ static int intel_pmu_init(void)
1401 unsigned int ebx; 1607 unsigned int ebx;
1402 int version; 1608 int version;
1403 1609
1404 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 1610 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
1611 /* check for P6 processor family */
1612 if (boot_cpu_data.x86 == 6) {
1613 return p6_pmu_init();
1614 } else {
1405 return -ENODEV; 1615 return -ENODEV;
1616 }
1617 }
1406 1618
1407 /* 1619 /*
1408 * Check whether the Architectural PerfMon supports 1620 * Check whether the Architectural PerfMon supports
@@ -1428,8 +1640,6 @@ static int intel_pmu_init(void)
1428 */ 1640 */
1429 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); 1641 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
1430 1642
1431 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1432
1433 /* 1643 /*
1434 * Install the hw-cache-events table: 1644 * Install the hw-cache-events table:
1435 */ 1645 */
@@ -1499,21 +1709,22 @@ void __init init_hw_perf_counters(void)
1499 pr_cont("%s PMU driver.\n", x86_pmu.name); 1709 pr_cont("%s PMU driver.\n", x86_pmu.name);
1500 1710
1501 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 1711 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1502 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1503 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", 1712 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1504 x86_pmu.num_counters, X86_PMC_MAX_GENERIC); 1713 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1714 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1505 } 1715 }
1506 perf_counter_mask = (1 << x86_pmu.num_counters) - 1; 1716 perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
1507 perf_max_counters = x86_pmu.num_counters; 1717 perf_max_counters = x86_pmu.num_counters;
1508 1718
1509 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { 1719 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1510 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1511 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", 1720 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1512 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); 1721 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1722 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1513 } 1723 }
1514 1724
1515 perf_counter_mask |= 1725 perf_counter_mask |=
1516 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; 1726 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1727 x86_pmu.intel_ctrl = perf_counter_mask;
1517 1728
1518 perf_counters_lapic_init(); 1729 perf_counters_lapic_init();
1519 register_die_notifier(&perf_counter_nmi_notifier); 1730 register_die_notifier(&perf_counter_nmi_notifier);
@@ -1563,6 +1774,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1563 1774
1564static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); 1775static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1565static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); 1776static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1777static DEFINE_PER_CPU(int, in_nmi_frame);
1566 1778
1567 1779
1568static void 1780static void
@@ -1578,7 +1790,9 @@ static void backtrace_warning(void *data, char *msg)
1578 1790
1579static int backtrace_stack(void *data, char *name) 1791static int backtrace_stack(void *data, char *name)
1580{ 1792{
1581 /* Process all stacks: */ 1793 per_cpu(in_nmi_frame, smp_processor_id()) =
1794 x86_is_stack_id(NMI_STACK, name);
1795
1582 return 0; 1796 return 0;
1583} 1797}
1584 1798
@@ -1586,6 +1800,9 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1586{ 1800{
1587 struct perf_callchain_entry *entry = data; 1801 struct perf_callchain_entry *entry = data;
1588 1802
1803 if (per_cpu(in_nmi_frame, smp_processor_id()))
1804 return;
1805
1589 if (reliable) 1806 if (reliable)
1590 callchain_store(entry, addr); 1807 callchain_store(entry, addr);
1591} 1808}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 5c481f6205bf..e60ed740d2b3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -803,8 +803,3 @@ int __kprobes lapic_wd_event(unsigned nmi_hz)
803 wd_ops->rearm(wd, nmi_hz); 803 wd_ops->rearm(wd, nmi_hz);
804 return 1; 804 return 1;
805} 805}
806
807int lapic_watchdog_ok(void)
808{
809 return wd_ops != NULL;
810}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 95ea5fa7d444..c8405718a4c3 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,6 +22,7 @@
22#include "dumpstack.h" 22#include "dumpstack.h"
23 23
24int panic_on_unrecovered_nmi; 24int panic_on_unrecovered_nmi;
25int panic_on_io_nmi;
25unsigned int code_bytes = 64; 26unsigned int code_bytes = 64;
26int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; 27int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
27static int die_counter; 28static int die_counter;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index d593cd1f58dc..bca5fba91c9e 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -19,6 +19,12 @@
19 19
20#include "dumpstack.h" 20#include "dumpstack.h"
21 21
22/* Just a stub for now */
23int x86_is_stack_id(int id, char *name)
24{
25 return 0;
26}
27
22void dump_trace(struct task_struct *task, struct pt_regs *regs, 28void dump_trace(struct task_struct *task, struct pt_regs *regs,
23 unsigned long *stack, unsigned long bp, 29 unsigned long *stack, unsigned long bp,
24 const struct stacktrace_ops *ops, void *data) 30 const struct stacktrace_ops *ops, void *data)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index d35db5993fd6..54b0a3276766 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -19,10 +19,8 @@
19 19
20#include "dumpstack.h" 20#include "dumpstack.h"
21 21
22static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 22
23 unsigned *usedp, char **idp) 23static char x86_stack_ids[][8] = {
24{
25 static char ids[][8] = {
26 [DEBUG_STACK - 1] = "#DB", 24 [DEBUG_STACK - 1] = "#DB",
27 [NMI_STACK - 1] = "NMI", 25 [NMI_STACK - 1] = "NMI",
28 [DOUBLEFAULT_STACK - 1] = "#DF", 26 [DOUBLEFAULT_STACK - 1] = "#DF",
@@ -33,6 +31,15 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
33 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 31 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
34#endif 32#endif
35 }; 33 };
34
35int x86_is_stack_id(int id, char *name)
36{
37 return x86_stack_ids[id - 1] == name;
38}
39
40static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
41 unsigned *usedp, char **idp)
42{
36 unsigned k; 43 unsigned k;
37 44
38 /* 45 /*
@@ -61,7 +68,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
61 if (*usedp & (1U << k)) 68 if (*usedp & (1U << k))
62 break; 69 break;
63 *usedp |= 1U << k; 70 *usedp |= 1U << k;
64 *idp = ids[k]; 71 *idp = x86_stack_ids[k];
65 return (unsigned long *)end; 72 return (unsigned long *)end;
66 } 73 }
67 /* 74 /*
@@ -81,12 +88,13 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
81 do { 88 do {
82 ++j; 89 ++j;
83 end -= EXCEPTION_STKSZ; 90 end -= EXCEPTION_STKSZ;
84 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); 91 x86_stack_ids[j][4] = '1' +
92 (j - N_EXCEPTION_STACKS);
85 } while (stack < end - EXCEPTION_STKSZ); 93 } while (stack < end - EXCEPTION_STKSZ);
86 if (*usedp & (1U << j)) 94 if (*usedp & (1U << j))
87 break; 95 break;
88 *usedp |= 1U << j; 96 *usedp |= 1U << j;
89 *idp = ids[j]; 97 *idp = x86_stack_ids[j];
90 return (unsigned long *)end; 98 return (unsigned long *)end;
91 } 99 }
92#endif 100#endif
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7271fa33d791..5cb5725b2bae 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -627,10 +627,9 @@ __init void e820_setup_gap(void)
627#ifdef CONFIG_X86_64 627#ifdef CONFIG_X86_64
628 if (!found) { 628 if (!found) {
629 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; 629 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
630 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " 630 printk(KERN_ERR
631 "address range\n" 631 "PCI: Warning: Cannot find a gap in the 32bit address range\n"
632 KERN_ERR "PCI: Unassigned devices with 32bit resource " 632 "PCI: Unassigned devices with 32bit resource registers may break!\n");
633 "registers may break!\n");
634 } 633 }
635#endif 634#endif
636 635
@@ -1383,6 +1382,8 @@ static unsigned long ram_alignment(resource_size_t pos)
1383 return 32*1024*1024; 1382 return 32*1024*1024;
1384} 1383}
1385 1384
1385#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
1386
1386void __init e820_reserve_resources_late(void) 1387void __init e820_reserve_resources_late(void)
1387{ 1388{
1388 int i; 1389 int i;
@@ -1400,17 +1401,19 @@ void __init e820_reserve_resources_late(void)
1400 * avoid stolen RAM: 1401 * avoid stolen RAM:
1401 */ 1402 */
1402 for (i = 0; i < e820.nr_map; i++) { 1403 for (i = 0; i < e820.nr_map; i++) {
1403 struct e820entry *entry = &e820_saved.map[i]; 1404 struct e820entry *entry = &e820.map[i];
1404 resource_size_t start, end; 1405 u64 start, end;
1405 1406
1406 if (entry->type != E820_RAM) 1407 if (entry->type != E820_RAM)
1407 continue; 1408 continue;
1408 start = entry->addr + entry->size; 1409 start = entry->addr + entry->size;
1409 end = round_up(start, ram_alignment(start)); 1410 end = round_up(start, ram_alignment(start)) - 1;
1410 if (start == end) 1411 if (end > MAX_RESOURCE_SIZE)
1412 end = MAX_RESOURCE_SIZE;
1413 if (start >= end)
1411 continue; 1414 continue;
1412 reserve_region_with_split(&iomem_resource, start, 1415 reserve_region_with_split(&iomem_resource, start, end,
1413 end - 1, "RAM buffer"); 1416 "RAM buffer");
1414 } 1417 }
1415} 1418}
1416 1419
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 96f7ac0bbf01..19ccf6d0dccf 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -512,7 +512,7 @@ void __init efi_enter_virtual_mode(void)
512 && end_pfn <= max_pfn_mapped)) 512 && end_pfn <= max_pfn_mapped))
513 va = __va(md->phys_addr); 513 va = __va(md->phys_addr);
514 else 514 else
515 va = efi_ioremap(md->phys_addr, size); 515 va = efi_ioremap(md->phys_addr, size, md->type);
516 516
517 md->virt_addr = (u64) (unsigned long) va; 517 md->virt_addr = (u64) (unsigned long) va;
518 518
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 22c3b7828c50..ac0621a7ac3d 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void)
98 early_runtime_code_mapping_set_exec(0); 98 early_runtime_code_mapping_set_exec(0);
99} 99}
100 100
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) 101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
102 u32 type)
102{ 103{
103 unsigned long last_map_pfn; 104 unsigned long last_map_pfn;
104 105
106 if (type == EFI_MEMORY_MAPPED_IO)
107 return ioremap(phys_addr, size);
108
105 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); 109 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
106 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) 110 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
107 return NULL; 111 return NULL;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 8663afb56535..0d98a01cbdb2 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -602,7 +602,11 @@ ignore_int:
602#endif 602#endif
603 iret 603 iret
604 604
605.section .cpuinit.data,"wa" 605#ifndef CONFIG_HOTPLUG_CPU
606 __CPUINITDATA
607#else
608 __REFDATA
609#endif
606.align 4 610.align 4
607ENTRY(initial_code) 611ENTRY(initial_code)
608 .long i386_start_kernel 612 .long i386_start_kernel
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 696f0e475c2d..92b7703d3d58 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -187,7 +187,7 @@ static void __init apic_intr_init(void)
187#ifdef CONFIG_X86_THERMAL_VECTOR 187#ifdef CONFIG_X86_THERMAL_VECTOR
188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
189#endif 189#endif
190#ifdef CONFIG_X86_THRESHOLD 190#ifdef CONFIG_X86_MCE_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif 192#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) 193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a78ecad0c900..c664d515f613 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -200,7 +200,7 @@ static void kvm_leave_lazy_mmu(void)
200 state->mode = paravirt_get_lazy_mode(); 200 state->mode = paravirt_get_lazy_mode();
201} 201}
202 202
203static void paravirt_ops_setup(void) 203static void __init paravirt_ops_setup(void)
204{ 204{
205 pv_info.name = "KVM"; 205 pv_info.name = "KVM";
206 pv_info.paravirt_enabled = 1; 206 pv_info.paravirt_enabled = 1;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 846510b78a09..2a62d843f015 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id)
347 347
348static struct irqaction mfgptirq = { 348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick, 349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING, 350 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
351 .name = "mfgpt-timer" 351 .name = "mfgpt-timer"
352}; 352};
353 353
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 47630479b067..1a041bcf506b 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -211,11 +211,11 @@ static __init int iommu_setup(char *p)
211#ifdef CONFIG_SWIOTLB 211#ifdef CONFIG_SWIOTLB
212 if (!strncmp(p, "soft", 4)) 212 if (!strncmp(p, "soft", 4))
213 swiotlb = 1; 213 swiotlb = 1;
214#endif
214 if (!strncmp(p, "pt", 2)) { 215 if (!strncmp(p, "pt", 2)) {
215 iommu_pass_through = 1; 216 iommu_pass_through = 1;
216 return 1; 217 return 1;
217 } 218 }
218#endif
219 219
220 gart_parse_options(p); 220 gart_parse_options(p);
221 221
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index cfd9f9063896..d2e56b8f48e7 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -675,7 +675,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
675 nommu: 675 nommu:
676 /* Should not happen anymore */ 676 /* Should not happen anymore */
677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
678 KERN_WARNING "falling back to iommu=soft.\n"); 678 "falling back to iommu=soft.\n");
679 return -1; 679 return -1;
680} 680}
681 681
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 4f9c55f3a7c0..03801f2f761f 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -60,7 +60,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
60 "adc %5,%%edx ; " 60 "adc %5,%%edx ; "
61 : "=A" (product), "=r" (tmp1), "=r" (tmp2) 61 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); 62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
63#elif __x86_64__ 63#elif defined(__x86_64__)
64 __asm__ ( 64 __asm__ (
65 "mul %%rdx ; shrd $32,%%rdx,%%rax" 65 "mul %%rdx ; shrd $32,%%rdx,%%rax"
66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); 66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d2d1ce8170f0..834c9da8bf9d 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/efi.h> 5#include <linux/efi.h>
6#include <linux/dmi.h>
6#include <acpi/reboot.h> 7#include <acpi/reboot.h>
7#include <asm/io.h> 8#include <asm/io.h>
8#include <asm/apic.h> 9#include <asm/apic.h>
@@ -17,7 +18,6 @@
17#include <asm/cpu.h> 18#include <asm/cpu.h>
18 19
19#ifdef CONFIG_X86_32 20#ifdef CONFIG_X86_32
20# include <linux/dmi.h>
21# include <linux/ctype.h> 21# include <linux/ctype.h>
22# include <linux/mc146818rtc.h> 22# include <linux/mc146818rtc.h>
23#else 23#else
@@ -249,6 +249,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), 249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
250 }, 250 },
251 }, 251 },
252 { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */
253 .callback = set_bios_reboot,
254 .ident = "CompuLab SBC-FITPC2",
255 .matches = {
256 DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
257 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
258 },
259 },
252 { } 260 { }
253}; 261};
254 262
@@ -396,6 +404,38 @@ EXPORT_SYMBOL(machine_real_restart);
396 404
397#endif /* CONFIG_X86_32 */ 405#endif /* CONFIG_X86_32 */
398 406
407/*
408 * Apple MacBook5,2 (2009 MacBook) needs reboot=p
409 */
410static int __init set_pci_reboot(const struct dmi_system_id *d)
411{
412 if (reboot_type != BOOT_CF9) {
413 reboot_type = BOOT_CF9;
414 printk(KERN_INFO "%s series board detected. "
415 "Selecting PCI-method for reboots.\n", d->ident);
416 }
417 return 0;
418}
419
420static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
421 { /* Handle problems with rebooting on Apple MacBook5,2 */
422 .callback = set_pci_reboot,
423 .ident = "Apple MacBook",
424 .matches = {
425 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
426 DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"),
427 },
428 },
429 { }
430};
431
432static int __init pci_reboot_init(void)
433{
434 dmi_check_system(pci_reboot_dmi_table);
435 return 0;
436}
437core_initcall(pci_reboot_init);
438
399static inline void kb_wait(void) 439static inline void kb_wait(void)
400{ 440{
401 int i; 441 int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index be5ae80f897f..63f32d220ef2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -289,6 +289,20 @@ void * __init extend_brk(size_t size, size_t align)
289 return ret; 289 return ret;
290} 290}
291 291
292#ifdef CONFIG_X86_64
293static void __init init_gbpages(void)
294{
295 if (direct_gbpages && cpu_has_gbpages)
296 printk(KERN_INFO "Using GB pages for direct mapping\n");
297 else
298 direct_gbpages = 0;
299}
300#else
301static inline void init_gbpages(void)
302{
303}
304#endif
305
292static void __init reserve_brk(void) 306static void __init reserve_brk(void)
293{ 307{
294 if (_brk_end > _brk_start) 308 if (_brk_end > _brk_start)
@@ -658,6 +672,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
658 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), 672 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
659 }, 673 },
660 }, 674 },
675 {
676 /*
677 * AMI BIOS with low memory corruption was found on Intel DG45ID board.
678 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
679 * match only DMI_BOARD_NAME and see if there is more bad products
680 * with this vendor.
681 */
682 .callback = dmi_low_memory_corruption,
683 .ident = "AMI BIOS",
684 .matches = {
685 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
686 },
687 },
661#endif 688#endif
662 {} 689 {}
663}; 690};
@@ -871,6 +898,8 @@ void __init setup_arch(char **cmdline_p)
871 898
872 reserve_brk(); 899 reserve_brk();
873 900
901 init_gbpages();
902
874 /* max_pfn_mapped is updated here */ 903 /* max_pfn_mapped is updated here */
875 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 904 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
876 max_pfn_mapped = max_low_pfn_mapped; 905 max_pfn_mapped = max_low_pfn_mapped;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 9c3f0823e6aa..29a3eef7cf4a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
124} 124}
125 125
126/* 126/*
127 * Remap allocator 127 * Large page remap allocator
128 * 128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for 129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping. 130 * each cpu and each is remapped into vmalloc area using PMD mapping.
@@ -137,105 +137,185 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
137 * better than only using 4k mappings while still being NUMA friendly. 137 * better than only using 4k mappings while still being NUMA friendly.
138 */ 138 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES 139#ifdef CONFIG_NEED_MULTIPLE_NODES
140static size_t pcpur_size __initdata; 140struct pcpul_ent {
141static void **pcpur_ptrs __initdata; 141 unsigned int cpu;
142 void *ptr;
143};
144
145static size_t pcpul_size;
146static struct pcpul_ent *pcpul_map;
147static struct vm_struct pcpul_vm;
142 148
143static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) 149static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
144{ 150{
145 size_t off = (size_t)pageno << PAGE_SHIFT; 151 size_t off = (size_t)pageno << PAGE_SHIFT;
146 152
147 if (off >= pcpur_size) 153 if (off >= pcpul_size)
148 return NULL; 154 return NULL;
149 155
150 return virt_to_page(pcpur_ptrs[cpu] + off); 156 return virt_to_page(pcpul_map[cpu].ptr + off);
151} 157}
152 158
153static ssize_t __init setup_pcpu_remap(size_t static_size) 159static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
154{ 160{
155 static struct vm_struct vm; 161 size_t map_size, dyn_size;
156 size_t ptrs_size, dyn_size;
157 unsigned int cpu; 162 unsigned int cpu;
163 int i, j;
158 ssize_t ret; 164 ssize_t ret;
159 165
160 /* 166 if (!chosen) {
161 * If large page isn't supported, there's no benefit in doing 167 size_t vm_size = VMALLOC_END - VMALLOC_START;
162 * this. Also, on non-NUMA, embedding is better. 168 size_t tot_size = num_possible_cpus() * PMD_SIZE;
163 * 169
164 * NOTE: disabled for now. 170 /* on non-NUMA, embedding is better */
165 */ 171 if (!pcpu_need_numa())
166 if (true || !cpu_has_pse || !pcpu_need_numa()) 172 return -EINVAL;
173
174 /* don't consume more than 20% of vmalloc area */
175 if (tot_size > vm_size / 5) {
176 pr_info("PERCPU: too large chunk size %zuMB for "
177 "large page remap\n", tot_size >> 20);
178 return -EINVAL;
179 }
180 }
181
182 /* need PSE */
183 if (!cpu_has_pse) {
184 pr_warning("PERCPU: lpage allocator requires PSE\n");
167 return -EINVAL; 185 return -EINVAL;
186 }
168 187
169 /* 188 /*
170 * Currently supports only single page. Supporting multiple 189 * Currently supports only single page. Supporting multiple
171 * pages won't be too difficult if it ever becomes necessary. 190 * pages won't be too difficult if it ever becomes necessary.
172 */ 191 */
173 pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + 192 pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
174 PERCPU_DYNAMIC_RESERVE); 193 PERCPU_DYNAMIC_RESERVE);
175 if (pcpur_size > PMD_SIZE) { 194 if (pcpul_size > PMD_SIZE) {
176 pr_warning("PERCPU: static data is larger than large page, " 195 pr_warning("PERCPU: static data is larger than large page, "
177 "can't use large page\n"); 196 "can't use large page\n");
178 return -EINVAL; 197 return -EINVAL;
179 } 198 }
180 dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; 199 dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
181 200
182 /* allocate pointer array and alloc large pages */ 201 /* allocate pointer array and alloc large pages */
183 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); 202 map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
184 pcpur_ptrs = alloc_bootmem(ptrs_size); 203 pcpul_map = alloc_bootmem(map_size);
185 204
186 for_each_possible_cpu(cpu) { 205 for_each_possible_cpu(cpu) {
187 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); 206 pcpul_map[cpu].cpu = cpu;
188 if (!pcpur_ptrs[cpu]) 207 pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
208 PMD_SIZE);
209 if (!pcpul_map[cpu].ptr) {
210 pr_warning("PERCPU: failed to allocate large page "
211 "for cpu%u\n", cpu);
189 goto enomem; 212 goto enomem;
213 }
190 214
191 /* 215 /*
192 * Only use pcpur_size bytes and give back the rest. 216 * Only use pcpul_size bytes and give back the rest.
193 * 217 *
194 * Ingo: The 2MB up-rounding bootmem is needed to make 218 * Ingo: The 2MB up-rounding bootmem is needed to make
195 * sure the partial 2MB page is still fully RAM - it's 219 * sure the partial 2MB page is still fully RAM - it's
196 * not well-specified to have a PAT-incompatible area 220 * not well-specified to have a PAT-incompatible area
197 * (unmapped RAM, device memory, etc.) in that hole. 221 * (unmapped RAM, device memory, etc.) in that hole.
198 */ 222 */
199 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), 223 free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
200 PMD_SIZE - pcpur_size); 224 PMD_SIZE - pcpul_size);
201 225
202 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); 226 memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
203 } 227 }
204 228
205 /* allocate address and map */ 229 /* allocate address and map */
206 vm.flags = VM_ALLOC; 230 pcpul_vm.flags = VM_ALLOC;
207 vm.size = num_possible_cpus() * PMD_SIZE; 231 pcpul_vm.size = num_possible_cpus() * PMD_SIZE;
208 vm_area_register_early(&vm, PMD_SIZE); 232 vm_area_register_early(&pcpul_vm, PMD_SIZE);
209 233
210 for_each_possible_cpu(cpu) { 234 for_each_possible_cpu(cpu) {
211 pmd_t *pmd; 235 pmd_t *pmd, pmd_v;
212 236
213 pmd = populate_extra_pmd((unsigned long)vm.addr 237 pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
214 + cpu * PMD_SIZE); 238 cpu * PMD_SIZE);
215 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), 239 pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
216 PAGE_KERNEL_LARGE)); 240 PAGE_KERNEL_LARGE);
241 set_pmd(pmd, pmd_v);
217 } 242 }
218 243
219 /* we're ready, commit */ 244 /* we're ready, commit */
220 pr_info("PERCPU: Remapped at %p with large pages, static data " 245 pr_info("PERCPU: Remapped at %p with large pages, static data "
221 "%zu bytes\n", vm.addr, static_size); 246 "%zu bytes\n", pcpul_vm.addr, static_size);
222 247
223 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 248 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
224 PERCPU_FIRST_CHUNK_RESERVE, dyn_size, 249 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
225 PMD_SIZE, vm.addr, NULL); 250 PMD_SIZE, pcpul_vm.addr, NULL);
226 goto out_free_ar; 251
252 /* sort pcpul_map array for pcpu_lpage_remapped() */
253 for (i = 0; i < num_possible_cpus() - 1; i++)
254 for (j = i + 1; j < num_possible_cpus(); j++)
255 if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
256 struct pcpul_ent tmp = pcpul_map[i];
257 pcpul_map[i] = pcpul_map[j];
258 pcpul_map[j] = tmp;
259 }
260
261 return ret;
227 262
228enomem: 263enomem:
229 for_each_possible_cpu(cpu) 264 for_each_possible_cpu(cpu)
230 if (pcpur_ptrs[cpu]) 265 if (pcpul_map[cpu].ptr)
231 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); 266 free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
232 ret = -ENOMEM; 267 free_bootmem(__pa(pcpul_map), map_size);
233out_free_ar: 268 return -ENOMEM;
234 free_bootmem(__pa(pcpur_ptrs), ptrs_size); 269}
235 return ret; 270
271/**
272 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
273 * @kaddr: the kernel address in question
274 *
275 * Determine whether @kaddr falls in the pcpul recycled area. This is
276 * used by pageattr to detect VM aliases and break up the pcpu PMD
277 * mapping such that the same physical page is not mapped under
278 * different attributes.
279 *
280 * The recycled area is always at the tail of a partially used PMD
281 * page.
282 *
283 * RETURNS:
284 * Address of corresponding remapped pcpu address if match is found;
285 * otherwise, NULL.
286 */
287void *pcpu_lpage_remapped(void *kaddr)
288{
289 void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
290 unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
291 int left = 0, right = num_possible_cpus() - 1;
292 int pos;
293
294 /* pcpul in use at all? */
295 if (!pcpul_map)
296 return NULL;
297
298 /* okay, perform binary search */
299 while (left <= right) {
300 pos = (left + right) / 2;
301
302 if (pcpul_map[pos].ptr < pmd_addr)
303 left = pos + 1;
304 else if (pcpul_map[pos].ptr > pmd_addr)
305 right = pos - 1;
306 else {
307 /* it shouldn't be in the area for the first chunk */
308 WARN_ON(offset < pcpul_size);
309
310 return pcpul_vm.addr +
311 pcpul_map[pos].cpu * PMD_SIZE + offset;
312 }
313 }
314
315 return NULL;
236} 316}
237#else 317#else
238static ssize_t __init setup_pcpu_remap(size_t static_size) 318static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
239{ 319{
240 return -EINVAL; 320 return -EINVAL;
241} 321}
@@ -249,7 +329,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
249 * mapping so that it can use PMD mapping without additional TLB 329 * mapping so that it can use PMD mapping without additional TLB
250 * pressure. 330 * pressure.
251 */ 331 */
252static ssize_t __init setup_pcpu_embed(size_t static_size) 332static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
253{ 333{
254 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; 334 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
255 335
@@ -258,7 +338,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
258 * this. Also, embedding allocation doesn't play well with 338 * this. Also, embedding allocation doesn't play well with
259 * NUMA. 339 * NUMA.
260 */ 340 */
261 if (!cpu_has_pse || pcpu_need_numa()) 341 if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
262 return -EINVAL; 342 return -EINVAL;
263 343
264 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, 344 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
@@ -308,8 +388,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
308 void *ptr; 388 void *ptr;
309 389
310 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); 390 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
311 if (!ptr) 391 if (!ptr) {
392 pr_warning("PERCPU: failed to allocate "
393 "4k page for cpu%u\n", cpu);
312 goto enomem; 394 goto enomem;
395 }
313 396
314 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); 397 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
315 pcpu4k_pages[j++] = virt_to_page(ptr); 398 pcpu4k_pages[j++] = virt_to_page(ptr);
@@ -333,6 +416,16 @@ out_free_ar:
333 return ret; 416 return ret;
334} 417}
335 418
419/* for explicit first chunk allocator selection */
420static char pcpu_chosen_alloc[16] __initdata;
421
422static int __init percpu_alloc_setup(char *str)
423{
424 strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
425 return 0;
426}
427early_param("percpu_alloc", percpu_alloc_setup);
428
336static inline void setup_percpu_segment(int cpu) 429static inline void setup_percpu_segment(int cpu)
337{ 430{
338#ifdef CONFIG_X86_32 431#ifdef CONFIG_X86_32
@@ -346,11 +439,6 @@ static inline void setup_percpu_segment(int cpu)
346#endif 439#endif
347} 440}
348 441
349/*
350 * Great future plan:
351 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
352 * Always point %gs to its beginning
353 */
354void __init setup_per_cpu_areas(void) 442void __init setup_per_cpu_areas(void)
355{ 443{
356 size_t static_size = __per_cpu_end - __per_cpu_start; 444 size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -367,9 +455,26 @@ void __init setup_per_cpu_areas(void)
367 * of large page mappings. Please read comments on top of 455 * of large page mappings. Please read comments on top of
368 * each allocator for details. 456 * each allocator for details.
369 */ 457 */
370 ret = setup_pcpu_remap(static_size); 458 ret = -EINVAL;
371 if (ret < 0) 459 if (strlen(pcpu_chosen_alloc)) {
372 ret = setup_pcpu_embed(static_size); 460 if (strcmp(pcpu_chosen_alloc, "4k")) {
461 if (!strcmp(pcpu_chosen_alloc, "lpage"))
462 ret = setup_pcpu_lpage(static_size, true);
463 else if (!strcmp(pcpu_chosen_alloc, "embed"))
464 ret = setup_pcpu_embed(static_size, true);
465 else
466 pr_warning("PERCPU: unknown allocator %s "
467 "specified\n", pcpu_chosen_alloc);
468 if (ret < 0)
469 pr_warning("PERCPU: %s allocator failed (%zd), "
470 "falling back to 4k\n",
471 pcpu_chosen_alloc, ret);
472 }
473 } else {
474 ret = setup_pcpu_lpage(static_size, false);
475 if (ret < 0)
476 ret = setup_pcpu_embed(static_size, false);
477 }
373 if (ret < 0) 478 if (ret < 0)
374 ret = setup_pcpu_4k(static_size); 479 ret = setup_pcpu_4k(static_size);
375 if (ret < 0) 480 if (ret < 0)
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 124d40c575df..8ccabb8a2f6a 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -711,7 +711,6 @@ uv_activation_descriptor_init(int node, int pnode)
711 unsigned long pa; 711 unsigned long pa;
712 unsigned long m; 712 unsigned long m;
713 unsigned long n; 713 unsigned long n;
714 unsigned long mmr_image;
715 struct bau_desc *adp; 714 struct bau_desc *adp;
716 struct bau_desc *ad2; 715 struct bau_desc *ad2;
717 716
@@ -727,12 +726,8 @@ uv_activation_descriptor_init(int node, int pnode)
727 n = pa >> uv_nshift; 726 n = pa >> uv_nshift;
728 m = pa & uv_mmask; 727 m = pa & uv_mmask;
729 728
730 mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); 729 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
731 if (mmr_image) { 730 (n << UV_DESC_BASE_PNODE_SHIFT | m));
732 uv_write_global_mmr64(pnode, (unsigned long)
733 UVH_LB_BAU_SB_DESCRIPTOR_BASE,
734 (n << UV_DESC_BASE_PNODE_SHIFT | m));
735 }
736 731
737 /* 732 /*
738 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each 733 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a0f48f5671c0..5204332f475d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -346,6 +346,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
346 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); 346 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
347 show_registers(regs); 347 show_registers(regs);
348 348
349 if (panic_on_io_nmi)
350 panic("NMI IOCK error: Not continuing");
351
349 /* Re-enable the IOCK line, wait for a few seconds */ 352 /* Re-enable the IOCK line, wait for a few seconds */
350 reason = (reason & 0xf) | 8; 353 reason = (reason & 0xf) | 8;
351 outb(reason, 0x61); 354 outb(reason, 0x61);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 367e87882041..78d185d797de 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -112,11 +112,6 @@ SECTIONS
112 _sdata = .; 112 _sdata = .;
113 DATA_DATA 113 DATA_DATA
114 CONSTRUCTORS 114 CONSTRUCTORS
115
116#ifdef CONFIG_X86_64
117 /* End of data section */
118 _edata = .;
119#endif
120 } :data 115 } :data
121 116
122#ifdef CONFIG_X86_32 117#ifdef CONFIG_X86_32
@@ -156,10 +151,8 @@ SECTIONS
156 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { 151 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
157 *(.data.read_mostly) 152 *(.data.read_mostly)
158 153
159#ifdef CONFIG_X86_32
160 /* End of data section */ 154 /* End of data section */
161 _edata = .; 155 _edata = .;
162#endif
163 } 156 }
164 157
165#ifdef CONFIG_X86_64 158#ifdef CONFIG_X86_64
@@ -400,8 +393,8 @@ SECTIONS
400 393
401 394
402#ifdef CONFIG_X86_32 395#ifdef CONFIG_X86_32
403ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), 396. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
404 "kernel image bigger than KERNEL_IMAGE_SIZE") 397 "kernel image bigger than KERNEL_IMAGE_SIZE");
405#else 398#else
406/* 399/*
407 * Per-cpu symbols which need to be offset from __per_cpu_load 400 * Per-cpu symbols which need to be offset from __per_cpu_load
@@ -414,12 +407,12 @@ INIT_PER_CPU(irq_stack_union);
414/* 407/*
415 * Build-time check on the image size: 408 * Build-time check on the image size:
416 */ 409 */
417ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), 410. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
418 "kernel image bigger than KERNEL_IMAGE_SIZE") 411 "kernel image bigger than KERNEL_IMAGE_SIZE");
419 412
420#ifdef CONFIG_SMP 413#ifdef CONFIG_SMP
421ASSERT((per_cpu__irq_stack_union == 0), 414. = ASSERT((per_cpu__irq_stack_union == 0),
422 "irq_stack_union is not at start of per-cpu area"); 415 "irq_stack_union is not at start of per-cpu area");
423#endif 416#endif
424 417
425#endif /* CONFIG_X86_32 */ 418#endif /* CONFIG_X86_32 */
@@ -427,7 +420,7 @@ ASSERT((per_cpu__irq_stack_union == 0),
427#ifdef CONFIG_KEXEC 420#ifdef CONFIG_KEXEC
428#include <asm/kexec.h> 421#include <asm/kexec.h>
429 422
430ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, 423. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
431 "kexec control code size is too big") 424 "kexec control code size is too big");
432#endif 425#endif
433 426
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5c3d6e81a7dc..7030b5f911bf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2157,7 +2157,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2157 else 2157 else
2158 /* 32 bits PSE 4MB page */ 2158 /* 32 bits PSE 4MB page */
2159 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 2159 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2160 context->rsvd_bits_mask[1][0] = ~0ull; 2160 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2161 break; 2161 break;
2162 case PT32E_ROOT_LEVEL: 2162 case PT32E_ROOT_LEVEL:
2163 context->rsvd_bits_mask[0][2] = 2163 context->rsvd_bits_mask[0][2] =
@@ -2170,7 +2170,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2170 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2170 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2171 rsvd_bits(maxphyaddr, 62) | 2171 rsvd_bits(maxphyaddr, 62) |
2172 rsvd_bits(13, 20); /* large page */ 2172 rsvd_bits(13, 20); /* large page */
2173 context->rsvd_bits_mask[1][0] = ~0ull; 2173 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2174 break; 2174 break;
2175 case PT64_ROOT_LEVEL: 2175 case PT64_ROOT_LEVEL:
2176 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 2176 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2186,7 +2186,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2186 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2186 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2187 rsvd_bits(maxphyaddr, 51) | 2187 rsvd_bits(maxphyaddr, 51) |
2188 rsvd_bits(13, 20); /* large page */ 2188 rsvd_bits(13, 20); /* large page */
2189 context->rsvd_bits_mask[1][0] = ~0ull; 2189 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2190 break; 2190 break;
2191 } 2191 }
2192} 2192}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 258e4591e1ca..67785f635399 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -281,7 +281,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
281{ 281{
282 unsigned access = gw->pt_access; 282 unsigned access = gw->pt_access;
283 struct kvm_mmu_page *shadow_page; 283 struct kvm_mmu_page *shadow_page;
284 u64 spte, *sptep; 284 u64 spte, *sptep = NULL;
285 int direct; 285 int direct;
286 gfn_t table_gfn; 286 gfn_t table_gfn;
287 int r; 287 int r;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e770bf349ec4..356a0ce85c68 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3012,6 +3012,12 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3012 return 1; 3012 return 1;
3013} 3013}
3014 3014
3015static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016{
3017 kvm_queue_exception(vcpu, UD_VECTOR);
3018 return 1;
3019}
3020
3015static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3021static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016{ 3022{
3017 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3023 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3198,6 +3204,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3198 [EXIT_REASON_HLT] = handle_halt, 3204 [EXIT_REASON_HLT] = handle_halt,
3199 [EXIT_REASON_INVLPG] = handle_invlpg, 3205 [EXIT_REASON_INVLPG] = handle_invlpg,
3200 [EXIT_REASON_VMCALL] = handle_vmcall, 3206 [EXIT_REASON_VMCALL] = handle_vmcall,
3207 [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
3208 [EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
3209 [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
3210 [EXIT_REASON_VMPTRST] = handle_vmx_insn,
3211 [EXIT_REASON_VMREAD] = handle_vmx_insn,
3212 [EXIT_REASON_VMRESUME] = handle_vmx_insn,
3213 [EXIT_REASON_VMWRITE] = handle_vmx_insn,
3214 [EXIT_REASON_VMOFF] = handle_vmx_insn,
3215 [EXIT_REASON_VMON] = handle_vmx_insn,
3201 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 3216 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
3202 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 3217 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3203 [EXIT_REASON_WBINVD] = handle_wbinvd, 3218 [EXIT_REASON_WBINVD] = handle_wbinvd,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 249540f98513..fe5474aec41a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -898,6 +898,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
898 case MSR_VM_HSAVE_PA: 898 case MSR_VM_HSAVE_PA:
899 case MSR_P6_EVNTSEL0: 899 case MSR_P6_EVNTSEL0:
900 case MSR_P6_EVNTSEL1: 900 case MSR_P6_EVNTSEL1:
901 case MSR_K7_EVNTSEL0:
901 data = 0; 902 data = 0;
902 break; 903 break;
903 case MSR_MTRRcap: 904 case MSR_MTRRcap:
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index c1b6c232e02b..616de4628d60 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1361,7 +1361,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1361 return 0; 1361 return 0;
1362} 1362}
1363 1363
1364void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) 1364static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1365{ 1365{
1366 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); 1366 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
1367 /* 1367 /*
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7bc65f0f62c4..d677fa9ca650 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -22,7 +22,8 @@
22 * 22 *
23 * So how does the kernel know it's a Guest? We'll see that later, but let's 23 * So how does the kernel know it's a Guest? We'll see that later, but let's
24 * just say that we end up here where we replace the native functions various 24 * just say that we end up here where we replace the native functions various
25 * "paravirt" structures with our Guest versions, then boot like normal. :*/ 25 * "paravirt" structures with our Guest versions, then boot like normal.
26:*/
26 27
27/* 28/*
28 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. 29 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
@@ -74,7 +75,8 @@
74 * 75 *
75 * The Guest in our tale is a simple creature: identical to the Host but 76 * The Guest in our tale is a simple creature: identical to the Host but
76 * behaving in simplified but equivalent ways. In particular, the Guest is the 77 * behaving in simplified but equivalent ways. In particular, the Guest is the
77 * same kernel as the Host (or at least, built from the same source code). :*/ 78 * same kernel as the Host (or at least, built from the same source code).
79:*/
78 80
79struct lguest_data lguest_data = { 81struct lguest_data lguest_data = {
80 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, 82 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
@@ -85,7 +87,8 @@ struct lguest_data lguest_data = {
85 .syscall_vec = SYSCALL_VECTOR, 87 .syscall_vec = SYSCALL_VECTOR,
86}; 88};
87 89
88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 90/*G:037
91 * async_hcall() is pretty simple: I'm quite proud of it really. We have a
89 * ring buffer of stored hypercalls which the Host will run though next time we 92 * ring buffer of stored hypercalls which the Host will run though next time we
90 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall 93 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go, 94 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
@@ -94,7 +97,8 @@ struct lguest_data lguest_data = {
94 * If we come around to a slot which hasn't been finished, then the table is 97 * If we come around to a slot which hasn't been finished, then the table is
95 * full and we just make the hypercall directly. This has the nice side 98 * full and we just make the hypercall directly. This has the nice side
96 * effect of causing the Host to run all the stored calls in the ring buffer 99 * effect of causing the Host to run all the stored calls in the ring buffer
97 * which empties it for next time! */ 100 * which empties it for next time!
101 */
98static void async_hcall(unsigned long call, unsigned long arg1, 102static void async_hcall(unsigned long call, unsigned long arg1,
99 unsigned long arg2, unsigned long arg3, 103 unsigned long arg2, unsigned long arg3,
100 unsigned long arg4) 104 unsigned long arg4)
@@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1,
103 static unsigned int next_call; 107 static unsigned int next_call;
104 unsigned long flags; 108 unsigned long flags;
105 109
106 /* Disable interrupts if not already disabled: we don't want an 110 /*
111 * Disable interrupts if not already disabled: we don't want an
107 * interrupt handler making a hypercall while we're already doing 112 * interrupt handler making a hypercall while we're already doing
108 * one! */ 113 * one!
114 */
109 local_irq_save(flags); 115 local_irq_save(flags);
110 if (lguest_data.hcall_status[next_call] != 0xFF) { 116 if (lguest_data.hcall_status[next_call] != 0xFF) {
111 /* Table full, so do normal hcall which will flush table. */ 117 /* Table full, so do normal hcall which will flush table. */
@@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1,
125 local_irq_restore(flags); 131 local_irq_restore(flags);
126} 132}
127 133
128/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first 134/*G:035
129 * real optimization trick! 135 * Notice the lazy_hcall() above, rather than hcall(). This is our first real
136 * optimization trick!
130 * 137 *
131 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do 138 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
132 * them as a batch when lazy_mode is eventually turned off. Because hypercalls 139 * them as a batch when lazy_mode is eventually turned off. Because hypercalls
@@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1,
136 * lguest_leave_lazy_mode(). 143 * lguest_leave_lazy_mode().
137 * 144 *
138 * So, when we're in lazy mode, we call async_hcall() to store the call for 145 * So, when we're in lazy mode, we call async_hcall() to store the call for
139 * future processing: */ 146 * future processing:
147 */
140static void lazy_hcall1(unsigned long call, 148static void lazy_hcall1(unsigned long call,
141 unsigned long arg1) 149 unsigned long arg1)
142{ 150{
@@ -146,6 +154,7 @@ static void lazy_hcall1(unsigned long call,
146 async_hcall(call, arg1, 0, 0, 0); 154 async_hcall(call, arg1, 0, 0, 0);
147} 155}
148 156
157/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
149static void lazy_hcall2(unsigned long call, 158static void lazy_hcall2(unsigned long call,
150 unsigned long arg1, 159 unsigned long arg1,
151 unsigned long arg2) 160 unsigned long arg2)
@@ -181,8 +190,10 @@ static void lazy_hcall4(unsigned long call,
181} 190}
182#endif 191#endif
183 192
184/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 193/*G:036
185 * issue the do-nothing hypercall to flush any stored calls. */ 194 * When lazy mode is turned off reset the per-cpu lazy mode variable and then
195 * issue the do-nothing hypercall to flush any stored calls.
196:*/
186static void lguest_leave_lazy_mmu_mode(void) 197static void lguest_leave_lazy_mmu_mode(void)
187{ 198{
188 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 199 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
@@ -208,9 +219,11 @@ static void lguest_end_context_switch(struct task_struct *next)
208 * check there before it tries to deliver an interrupt. 219 * check there before it tries to deliver an interrupt.
209 */ 220 */
210 221
211/* save_flags() is expected to return the processor state (ie. "flags"). The 222/*
223 * save_flags() is expected to return the processor state (ie. "flags"). The
212 * flags word contains all kind of stuff, but in practice Linux only cares 224 * flags word contains all kind of stuff, but in practice Linux only cares
213 * about the interrupt flag. Our "save_flags()" just returns that. */ 225 * about the interrupt flag. Our "save_flags()" just returns that.
226 */
214static unsigned long save_fl(void) 227static unsigned long save_fl(void)
215{ 228{
216 return lguest_data.irq_enabled; 229 return lguest_data.irq_enabled;
@@ -222,13 +235,15 @@ static void irq_disable(void)
222 lguest_data.irq_enabled = 0; 235 lguest_data.irq_enabled = 0;
223} 236}
224 237
225/* Let's pause a moment. Remember how I said these are called so often? 238/*
239 * Let's pause a moment. Remember how I said these are called so often?
226 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to 240 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
227 * break some rules. In particular, these functions are assumed to save their 241 * break some rules. In particular, these functions are assumed to save their
228 * own registers if they need to: normal C functions assume they can trash the 242 * own registers if they need to: normal C functions assume they can trash the
229 * eax register. To use normal C functions, we use 243 * eax register. To use normal C functions, we use
230 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the 244 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
231 * C function, then restores it. */ 245 * C function, then restores it.
246 */
232PV_CALLEE_SAVE_REGS_THUNK(save_fl); 247PV_CALLEE_SAVE_REGS_THUNK(save_fl);
233PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 248PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
234/*:*/ 249/*:*/
@@ -237,18 +252,18 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
237extern void lg_irq_enable(void); 252extern void lg_irq_enable(void);
238extern void lg_restore_fl(unsigned long flags); 253extern void lg_restore_fl(unsigned long flags);
239 254
240/*M:003 Note that we don't check for outstanding interrupts when we re-enable 255/*M:003
241 * them (or when we unmask an interrupt). This seems to work for the moment, 256 * We could be more efficient in our checking of outstanding interrupts, rather
242 * since interrupts are rare and we'll just get the interrupt on the next timer 257 * than using a branch. One way would be to put the "irq_enabled" field in a
243 * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way 258 * page by itself, and have the Host write-protect it when an interrupt comes
244 * would be to put the "irq_enabled" field in a page by itself, and have the 259 * in when irqs are disabled. There will then be a page fault as soon as
245 * Host write-protect it when an interrupt comes in when irqs are disabled. 260 * interrupts are re-enabled.
246 * There will then be a page fault as soon as interrupts are re-enabled.
247 * 261 *
248 * A better method is to implement soft interrupt disable generally for x86: 262 * A better method is to implement soft interrupt disable generally for x86:
249 * instead of disabling interrupts, we set a flag. If an interrupt does come 263 * instead of disabling interrupts, we set a flag. If an interrupt does come
250 * in, we then disable them for real. This is uncommon, so we could simply use 264 * in, we then disable them for real. This is uncommon, so we could simply use
251 * a hypercall for interrupt control and not worry about efficiency. :*/ 265 * a hypercall for interrupt control and not worry about efficiency.
266:*/
252 267
253/*G:034 268/*G:034
254 * The Interrupt Descriptor Table (IDT). 269 * The Interrupt Descriptor Table (IDT).
@@ -261,10 +276,12 @@ extern void lg_restore_fl(unsigned long flags);
261static void lguest_write_idt_entry(gate_desc *dt, 276static void lguest_write_idt_entry(gate_desc *dt,
262 int entrynum, const gate_desc *g) 277 int entrynum, const gate_desc *g)
263{ 278{
264 /* The gate_desc structure is 8 bytes long: we hand it to the Host in 279 /*
280 * The gate_desc structure is 8 bytes long: we hand it to the Host in
265 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors 281 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
266 * around like this; typesafety wasn't a big concern in Linux's early 282 * around like this; typesafety wasn't a big concern in Linux's early
267 * years. */ 283 * years.
284 */
268 u32 *desc = (u32 *)g; 285 u32 *desc = (u32 *)g;
269 /* Keep the local copy up to date. */ 286 /* Keep the local copy up to date. */
270 native_write_idt_entry(dt, entrynum, g); 287 native_write_idt_entry(dt, entrynum, g);
@@ -272,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt,
272 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); 289 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
273} 290}
274 291
275/* Changing to a different IDT is very rare: we keep the IDT up-to-date every 292/*
293 * Changing to a different IDT is very rare: we keep the IDT up-to-date every
276 * time it is written, so we can simply loop through all entries and tell the 294 * time it is written, so we can simply loop through all entries and tell the
277 * Host about them. */ 295 * Host about them.
296 */
278static void lguest_load_idt(const struct desc_ptr *desc) 297static void lguest_load_idt(const struct desc_ptr *desc)
279{ 298{
280 unsigned int i; 299 unsigned int i;
@@ -305,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
305 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); 324 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
306} 325}
307 326
308/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, 327/*
328 * For a single GDT entry which changes, we do the lazy thing: alter our GDT,
309 * then tell the Host to reload the entire thing. This operation is so rare 329 * then tell the Host to reload the entire thing. This operation is so rare
310 * that this naive implementation is reasonable. */ 330 * that this naive implementation is reasonable.
331 */
311static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, 332static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
312 const void *desc, int type) 333 const void *desc, int type)
313{ 334{
@@ -317,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
317 dt[entrynum].a, dt[entrynum].b); 338 dt[entrynum].a, dt[entrynum].b);
318} 339}
319 340
320/* OK, I lied. There are three "thread local storage" GDT entries which change 341/*
342 * OK, I lied. There are three "thread local storage" GDT entries which change
321 * on every context switch (these three entries are how glibc implements 343 * on every context switch (these three entries are how glibc implements
322 * __thread variables). So we have a hypercall specifically for this case. */ 344 * __thread variables). So we have a hypercall specifically for this case.
345 */
323static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) 346static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
324{ 347{
325 /* There's one problem which normal hardware doesn't have: the Host 348 /*
349 * There's one problem which normal hardware doesn't have: the Host
326 * can't handle us removing entries we're currently using. So we clear 350 * can't handle us removing entries we're currently using. So we clear
327 * the GS register here: if it's needed it'll be reloaded anyway. */ 351 * the GS register here: if it's needed it'll be reloaded anyway.
352 */
328 lazy_load_gs(0); 353 lazy_load_gs(0);
329 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); 354 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
330} 355}
331 356
332/*G:038 That's enough excitement for now, back to ploughing through each of 357/*G:038
333 * the different pv_ops structures (we're about 1/3 of the way through). 358 * That's enough excitement for now, back to ploughing through each of the
359 * different pv_ops structures (we're about 1/3 of the way through).
334 * 360 *
335 * This is the Local Descriptor Table, another weird Intel thingy. Linux only 361 * This is the Local Descriptor Table, another weird Intel thingy. Linux only
336 * uses this for some strange applications like Wine. We don't do anything 362 * uses this for some strange applications like Wine. We don't do anything
337 * here, so they'll get an informative and friendly Segmentation Fault. */ 363 * here, so they'll get an informative and friendly Segmentation Fault.
364 */
338static void lguest_set_ldt(const void *addr, unsigned entries) 365static void lguest_set_ldt(const void *addr, unsigned entries)
339{ 366{
340} 367}
341 368
342/* This loads a GDT entry into the "Task Register": that entry points to a 369/*
370 * This loads a GDT entry into the "Task Register": that entry points to a
343 * structure called the Task State Segment. Some comments scattered though the 371 * structure called the Task State Segment. Some comments scattered though the
344 * kernel code indicate that this used for task switching in ages past, along 372 * kernel code indicate that this used for task switching in ages past, along
345 * with blood sacrifice and astrology. 373 * with blood sacrifice and astrology.
@@ -347,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries)
347 * Now there's nothing interesting in here that we don't get told elsewhere. 375 * Now there's nothing interesting in here that we don't get told elsewhere.
348 * But the native version uses the "ltr" instruction, which makes the Host 376 * But the native version uses the "ltr" instruction, which makes the Host
349 * complain to the Guest about a Segmentation Fault and it'll oops. So we 377 * complain to the Guest about a Segmentation Fault and it'll oops. So we
350 * override the native version with a do-nothing version. */ 378 * override the native version with a do-nothing version.
379 */
351static void lguest_load_tr_desc(void) 380static void lguest_load_tr_desc(void)
352{ 381{
353} 382}
354 383
355/* The "cpuid" instruction is a way of querying both the CPU identity 384/*
385 * The "cpuid" instruction is a way of querying both the CPU identity
356 * (manufacturer, model, etc) and its features. It was introduced before the 386 * (manufacturer, model, etc) and its features. It was introduced before the
357 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. 387 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
358 * As you might imagine, after a decade and a half this treatment, it is now a 388 * As you might imagine, after a decade and a half this treatment, it is now a
359 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. 389 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
360 * 390 *
361 * This instruction even it has its own Wikipedia entry. The Wikipedia entry 391 * This instruction even it has its own Wikipedia entry. The Wikipedia entry
362 * has been translated into 4 languages. I am not making this up! 392 * has been translated into 5 languages. I am not making this up!
363 * 393 *
364 * We could get funky here and identify ourselves as "GenuineLguest", but 394 * We could get funky here and identify ourselves as "GenuineLguest", but
365 * instead we just use the real "cpuid" instruction. Then I pretty much turned 395 * instead we just use the real "cpuid" instruction. Then I pretty much turned
@@ -371,7 +401,8 @@ static void lguest_load_tr_desc(void)
371 * Replacing the cpuid so we can turn features off is great for the kernel, but 401 * Replacing the cpuid so we can turn features off is great for the kernel, but
372 * anyone (including userspace) can just use the raw "cpuid" instruction and 402 * anyone (including userspace) can just use the raw "cpuid" instruction and
373 * the Host won't even notice since it isn't privileged. So we try not to get 403 * the Host won't even notice since it isn't privileged. So we try not to get
374 * too worked up about it. */ 404 * too worked up about it.
405 */
375static void lguest_cpuid(unsigned int *ax, unsigned int *bx, 406static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
376 unsigned int *cx, unsigned int *dx) 407 unsigned int *cx, unsigned int *dx)
377{ 408{
@@ -379,38 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
379 410
380 native_cpuid(ax, bx, cx, dx); 411 native_cpuid(ax, bx, cx, dx);
381 switch (function) { 412 switch (function) {
382 case 1: /* Basic feature request. */ 413 /*
383 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 414 * CPUID 0 gives the highest legal CPUID number (and the ID string).
415 * We futureproof our code a little by sticking to known CPUID values.
416 */
417 case 0:
418 if (*ax > 5)
419 *ax = 5;
420 break;
421
422 /*
423 * CPUID 1 is a basic feature request.
424 *
425 * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
426 * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
427 */
428 case 1:
384 *cx &= 0x00002201; 429 *cx &= 0x00002201;
385 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
386 *dx &= 0x07808151; 430 *dx &= 0x07808151;
387 /* The Host can do a nice optimization if it knows that the 431 /*
432 * The Host can do a nice optimization if it knows that the
388 * kernel mappings (addresses above 0xC0000000 or whatever 433 * kernel mappings (addresses above 0xC0000000 or whatever
389 * PAGE_OFFSET is set to) haven't changed. But Linux calls 434 * PAGE_OFFSET is set to) haven't changed. But Linux calls
390 * flush_tlb_user() for both user and kernel mappings unless 435 * flush_tlb_user() for both user and kernel mappings unless
391 * the Page Global Enable (PGE) feature bit is set. */ 436 * the Page Global Enable (PGE) feature bit is set.
437 */
392 *dx |= 0x00002000; 438 *dx |= 0x00002000;
393 /* We also lie, and say we're family id 5. 6 or greater 439 /*
440 * We also lie, and say we're family id 5. 6 or greater
394 * leads to a rdmsr in early_init_intel which we can't handle. 441 * leads to a rdmsr in early_init_intel which we can't handle.
395 * Family ID is returned as bits 8-12 in ax. */ 442 * Family ID is returned as bits 8-12 in ax.
443 */
396 *ax &= 0xFFFFF0FF; 444 *ax &= 0xFFFFF0FF;
397 *ax |= 0x00000500; 445 *ax |= 0x00000500;
398 break; 446 break;
447 /*
448 * 0x80000000 returns the highest Extended Function, so we futureproof
449 * like we do above by limiting it to known fields.
450 */
399 case 0x80000000: 451 case 0x80000000:
400 /* Futureproof this a little: if they ask how much extended
401 * processor information there is, limit it to known fields. */
402 if (*ax > 0x80000008) 452 if (*ax > 0x80000008)
403 *ax = 0x80000008; 453 *ax = 0x80000008;
404 break; 454 break;
455
456 /*
457 * PAE systems can mark pages as non-executable. Linux calls this the
458 * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
459 * Virus Protection). We just switch turn if off here, since we don't
460 * support it.
461 */
405 case 0x80000001: 462 case 0x80000001:
406 /* Here we should fix nx cap depending on host. */
407 /* For this version of PAE, we just clear NX bit. */
408 *dx &= ~(1 << 20); 463 *dx &= ~(1 << 20);
409 break; 464 break;
410 } 465 }
411} 466}
412 467
413/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. 468/*
469 * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
414 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother 470 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
415 * it. The Host needs to know when the Guest wants to change them, so we have 471 * it. The Host needs to know when the Guest wants to change them, so we have
416 * a whole series of functions like read_cr0() and write_cr0(). 472 * a whole series of functions like read_cr0() and write_cr0().
@@ -425,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
425 * name like "FPUTRAP bit" be a little less cryptic? 481 * name like "FPUTRAP bit" be a little less cryptic?
426 * 482 *
427 * We store cr0 locally because the Host never changes it. The Guest sometimes 483 * We store cr0 locally because the Host never changes it. The Guest sometimes
428 * wants to read it and we'd prefer not to bother the Host unnecessarily. */ 484 * wants to read it and we'd prefer not to bother the Host unnecessarily.
485 */
429static unsigned long current_cr0; 486static unsigned long current_cr0;
430static void lguest_write_cr0(unsigned long val) 487static void lguest_write_cr0(unsigned long val)
431{ 488{
@@ -438,18 +495,22 @@ static unsigned long lguest_read_cr0(void)
438 return current_cr0; 495 return current_cr0;
439} 496}
440 497
441/* Intel provided a special instruction to clear the TS bit for people too cool 498/*
499 * Intel provided a special instruction to clear the TS bit for people too cool
442 * to use write_cr0() to do it. This "clts" instruction is faster, because all 500 * to use write_cr0() to do it. This "clts" instruction is faster, because all
443 * the vowels have been optimized out. */ 501 * the vowels have been optimized out.
502 */
444static void lguest_clts(void) 503static void lguest_clts(void)
445{ 504{
446 lazy_hcall1(LHCALL_TS, 0); 505 lazy_hcall1(LHCALL_TS, 0);
447 current_cr0 &= ~X86_CR0_TS; 506 current_cr0 &= ~X86_CR0_TS;
448} 507}
449 508
450/* cr2 is the virtual address of the last page fault, which the Guest only ever 509/*
510 * cr2 is the virtual address of the last page fault, which the Guest only ever
451 * reads. The Host kindly writes this into our "struct lguest_data", so we 511 * reads. The Host kindly writes this into our "struct lguest_data", so we
452 * just read it out of there. */ 512 * just read it out of there.
513 */
453static unsigned long lguest_read_cr2(void) 514static unsigned long lguest_read_cr2(void)
454{ 515{
455 return lguest_data.cr2; 516 return lguest_data.cr2;
@@ -458,10 +519,12 @@ static unsigned long lguest_read_cr2(void)
458/* See lguest_set_pte() below. */ 519/* See lguest_set_pte() below. */
459static bool cr3_changed = false; 520static bool cr3_changed = false;
460 521
461/* cr3 is the current toplevel pagetable page: the principle is the same as 522/*
523 * cr3 is the current toplevel pagetable page: the principle is the same as
462 * cr0. Keep a local copy, and tell the Host when it changes. The only 524 * cr0. Keep a local copy, and tell the Host when it changes. The only
463 * difference is that our local copy is in lguest_data because the Host needs 525 * difference is that our local copy is in lguest_data because the Host needs
464 * to set it upon our initial hypercall. */ 526 * to set it upon our initial hypercall.
527 */
465static void lguest_write_cr3(unsigned long cr3) 528static void lguest_write_cr3(unsigned long cr3)
466{ 529{
467 lguest_data.pgdir = cr3; 530 lguest_data.pgdir = cr3;
@@ -506,7 +569,7 @@ static void lguest_write_cr4(unsigned long val)
506 * cr3 ---> +---------+ 569 * cr3 ---> +---------+
507 * | --------->+---------+ 570 * | --------->+---------+
508 * | | | PADDR1 | 571 * | | | PADDR1 |
509 * Top-level | | PADDR2 | 572 * Mid-level | | PADDR2 |
510 * (PMD) page | | | 573 * (PMD) page | | |
511 * | | Lower-level | 574 * | | Lower-level |
512 * | | (PTE) page | 575 * | | (PTE) page |
@@ -526,21 +589,62 @@ static void lguest_write_cr4(unsigned long val)
526 * Index into top Index into second Offset within page 589 * Index into top Index into second Offset within page
527 * page directory page pagetable page 590 * page directory page pagetable page
528 * 591 *
529 * The kernel spends a lot of time changing both the top-level page directory 592 * Now, unfortunately, this isn't the whole story: Intel added Physical Address
530 * and lower-level pagetable pages. The Guest doesn't know physical addresses, 593 * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
531 * so while it maintains these page tables exactly like normal, it also needs 594 * These are held in 64-bit page table entries, so we can now only fit 512
532 * to keep the Host informed whenever it makes a change: the Host will create 595 * entries in a page, and the neat three-level tree breaks down.
533 * the real page tables based on the Guests'. 596 *
597 * The result is a four level page table:
598 *
599 * cr3 --> [ 4 Upper ]
600 * [ Level ]
601 * [ Entries ]
602 * [(PUD Page)]---> +---------+
603 * | --------->+---------+
604 * | | | PADDR1 |
605 * Mid-level | | PADDR2 |
606 * (PMD) page | | |
607 * | | Lower-level |
608 * | | (PTE) page |
609 * | | | |
610 * .... ....
611 *
612 *
613 * And the virtual address is decoded as:
614 *
615 * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
616 * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
617 * Index into Index into mid Index into lower Offset within page
618 * top entries directory page pagetable page
619 *
620 * It's too hard to switch between these two formats at runtime, so Linux only
621 * supports one or the other depending on whether CONFIG_X86_PAE is set. Many
622 * distributions turn it on, and not just for people with silly amounts of
623 * memory: the larger PTE entries allow room for the NX bit, which lets the
624 * kernel disable execution of pages and increase security.
625 *
626 * This was a problem for lguest, which couldn't run on these distributions;
627 * then Matias Zabaljauregui figured it all out and implemented it, and only a
628 * handful of puppies were crushed in the process!
629 *
630 * Back to our point: the kernel spends a lot of time changing both the
631 * top-level page directory and lower-level pagetable pages. The Guest doesn't
632 * know physical addresses, so while it maintains these page tables exactly
633 * like normal, it also needs to keep the Host informed whenever it makes a
634 * change: the Host will create the real page tables based on the Guests'.
534 */ 635 */
535 636
536/* The Guest calls this to set a second-level entry (pte), ie. to map a page 637/*
537 * into a process' address space. We set the entry then tell the Host the 638 * The Guest calls this after it has set a second-level entry (pte), ie. to map
538 * toplevel and address this corresponds to. The Guest uses one pagetable per 639 * a page into a process' address space. Wetell the Host the toplevel and
539 * process, so we need to tell the Host which one we're changing (mm->pgd). */ 640 * address this corresponds to. The Guest uses one pagetable per process, so
641 * we need to tell the Host which one we're changing (mm->pgd).
642 */
540static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 643static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
541 pte_t *ptep) 644 pte_t *ptep)
542{ 645{
543#ifdef CONFIG_X86_PAE 646#ifdef CONFIG_X86_PAE
647 /* PAE needs to hand a 64 bit page table entry, so it uses two args. */
544 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, 648 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
545 ptep->pte_low, ptep->pte_high); 649 ptep->pte_low, ptep->pte_high);
546#else 650#else
@@ -548,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
548#endif 652#endif
549} 653}
550 654
655/* This is the "set and update" combo-meal-deal version. */
551static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 656static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
552 pte_t *ptep, pte_t pteval) 657 pte_t *ptep, pte_t pteval)
553{ 658{
@@ -555,10 +660,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
555 lguest_pte_update(mm, addr, ptep); 660 lguest_pte_update(mm, addr, ptep);
556} 661}
557 662
558/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd 663/*
664 * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
559 * to set a middle-level entry when PAE is activated. 665 * to set a middle-level entry when PAE is activated.
666 *
560 * Again, we set the entry then tell the Host which page we changed, 667 * Again, we set the entry then tell the Host which page we changed,
561 * and the index of the entry we changed. */ 668 * and the index of the entry we changed.
669 */
562#ifdef CONFIG_X86_PAE 670#ifdef CONFIG_X86_PAE
563static void lguest_set_pud(pud_t *pudp, pud_t pudval) 671static void lguest_set_pud(pud_t *pudp, pud_t pudval)
564{ 672{
@@ -577,8 +685,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
577} 685}
578#else 686#else
579 687
580/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not 688/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
581 * activated. */
582static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 689static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
583{ 690{
584 native_set_pmd(pmdp, pmdval); 691 native_set_pmd(pmdp, pmdval);
@@ -587,7 +694,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
587} 694}
588#endif 695#endif
589 696
590/* There are a couple of legacy places where the kernel sets a PTE, but we 697/*
698 * There are a couple of legacy places where the kernel sets a PTE, but we
591 * don't know the top level any more. This is useless for us, since we don't 699 * don't know the top level any more. This is useless for us, since we don't
592 * know which pagetable is changing or what address, so we just tell the Host 700 * know which pagetable is changing or what address, so we just tell the Host
593 * to forget all of them. Fortunately, this is very rare. 701 * to forget all of them. Fortunately, this is very rare.
@@ -595,7 +703,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
595 * ... except in early boot when the kernel sets up the initial pagetables, 703 * ... except in early boot when the kernel sets up the initial pagetables,
596 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell 704 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell
597 * the Host anything changed until we've done the first page table switch, 705 * the Host anything changed until we've done the first page table switch,
598 * which brings boot back to 0.25 seconds. */ 706 * which brings boot back to 0.25 seconds.
707 */
599static void lguest_set_pte(pte_t *ptep, pte_t pteval) 708static void lguest_set_pte(pte_t *ptep, pte_t pteval)
600{ 709{
601 native_set_pte(ptep, pteval); 710 native_set_pte(ptep, pteval);
@@ -604,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
604} 713}
605 714
606#ifdef CONFIG_X86_PAE 715#ifdef CONFIG_X86_PAE
716/*
717 * With 64-bit PTE values, we need to be careful setting them: if we set 32
718 * bits at a time, the hardware could see a weird half-set entry. These
719 * versions ensure we update all 64 bits at once.
720 */
607static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) 721static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
608{ 722{
609 native_set_pte_atomic(ptep, pte); 723 native_set_pte_atomic(ptep, pte);
@@ -611,19 +725,21 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
611 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 725 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
612} 726}
613 727
614void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 728static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
729 pte_t *ptep)
615{ 730{
616 native_pte_clear(mm, addr, ptep); 731 native_pte_clear(mm, addr, ptep);
617 lguest_pte_update(mm, addr, ptep); 732 lguest_pte_update(mm, addr, ptep);
618} 733}
619 734
620void lguest_pmd_clear(pmd_t *pmdp) 735static void lguest_pmd_clear(pmd_t *pmdp)
621{ 736{
622 lguest_set_pmd(pmdp, __pmd(0)); 737 lguest_set_pmd(pmdp, __pmd(0));
623} 738}
624#endif 739#endif
625 740
626/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 741/*
742 * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
627 * native page table operations. On native hardware you can set a new page 743 * native page table operations. On native hardware you can set a new page
628 * table entry whenever you want, but if you want to remove one you have to do 744 * table entry whenever you want, but if you want to remove one you have to do
629 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). 745 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@ -632,24 +748,29 @@ void lguest_pmd_clear(pmd_t *pmdp)
632 * called when a valid entry is written, not when it's removed (ie. marked not 748 * called when a valid entry is written, not when it's removed (ie. marked not
633 * present). Instead, this is where we come when the Guest wants to remove a 749 * present). Instead, this is where we come when the Guest wants to remove a
634 * page table entry: we tell the Host to set that entry to 0 (ie. the present 750 * page table entry: we tell the Host to set that entry to 0 (ie. the present
635 * bit is zero). */ 751 * bit is zero).
752 */
636static void lguest_flush_tlb_single(unsigned long addr) 753static void lguest_flush_tlb_single(unsigned long addr)
637{ 754{
638 /* Simply set it to zero: if it was not, it will fault back in. */ 755 /* Simply set it to zero: if it was not, it will fault back in. */
639 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); 756 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
640} 757}
641 758
642/* This is what happens after the Guest has removed a large number of entries. 759/*
760 * This is what happens after the Guest has removed a large number of entries.
643 * This tells the Host that any of the page table entries for userspace might 761 * This tells the Host that any of the page table entries for userspace might
644 * have changed, ie. virtual addresses below PAGE_OFFSET. */ 762 * have changed, ie. virtual addresses below PAGE_OFFSET.
763 */
645static void lguest_flush_tlb_user(void) 764static void lguest_flush_tlb_user(void)
646{ 765{
647 lazy_hcall1(LHCALL_FLUSH_TLB, 0); 766 lazy_hcall1(LHCALL_FLUSH_TLB, 0);
648} 767}
649 768
650/* This is called when the kernel page tables have changed. That's not very 769/*
770 * This is called when the kernel page tables have changed. That's not very
651 * common (unless the Guest is using highmem, which makes the Guest extremely 771 * common (unless the Guest is using highmem, which makes the Guest extremely
652 * slow), so it's worth separating this from the user flushing above. */ 772 * slow), so it's worth separating this from the user flushing above.
773 */
653static void lguest_flush_tlb_kernel(void) 774static void lguest_flush_tlb_kernel(void)
654{ 775{
655 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 776 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
@@ -686,26 +807,38 @@ static struct irq_chip lguest_irq_controller = {
686 .unmask = enable_lguest_irq, 807 .unmask = enable_lguest_irq,
687}; 808};
688 809
689/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware 810/*
811 * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
690 * interrupt (except 128, which is used for system calls), and then tells the 812 * interrupt (except 128, which is used for system calls), and then tells the
691 * Linux infrastructure that each interrupt is controlled by our level-based 813 * Linux infrastructure that each interrupt is controlled by our level-based
692 * lguest interrupt controller. */ 814 * lguest interrupt controller.
815 */
693static void __init lguest_init_IRQ(void) 816static void __init lguest_init_IRQ(void)
694{ 817{
695 unsigned int i; 818 unsigned int i;
696 819
697 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 820 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
698 /* Some systems map "vectors" to interrupts weirdly. Lguest has 821 /* Some systems map "vectors" to interrupts weirdly. Not us! */
699 * a straightforward 1 to 1 mapping, so force that here. */
700 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 822 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
701 if (i != SYSCALL_VECTOR) 823 if (i != SYSCALL_VECTOR)
702 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 824 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
703 } 825 }
704 /* This call is required to set up for 4k stacks, where we have 826
705 * separate stacks for hard and soft interrupts. */ 827 /*
828 * This call is required to set up for 4k stacks, where we have
829 * separate stacks for hard and soft interrupts.
830 */
706 irq_ctx_init(smp_processor_id()); 831 irq_ctx_init(smp_processor_id());
707} 832}
708 833
834/*
835 * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
836 * rather than set them in lguest_init_IRQ we are called here every time an
837 * lguest device needs an interrupt.
838 *
839 * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
840 * pass that up!
841 */
709void lguest_setup_irq(unsigned int irq) 842void lguest_setup_irq(unsigned int irq)
710{ 843{
711 irq_to_desc_alloc_node(irq, 0); 844 irq_to_desc_alloc_node(irq, 0);
@@ -724,31 +857,39 @@ static unsigned long lguest_get_wallclock(void)
724 return lguest_data.time.tv_sec; 857 return lguest_data.time.tv_sec;
725} 858}
726 859
727/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 860/*
861 * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
728 * what speed it runs at, or 0 if it's unusable as a reliable clock source. 862 * what speed it runs at, or 0 if it's unusable as a reliable clock source.
729 * This matches what we want here: if we return 0 from this function, the x86 863 * This matches what we want here: if we return 0 from this function, the x86
730 * TSC clock will give up and not register itself. */ 864 * TSC clock will give up and not register itself.
865 */
731static unsigned long lguest_tsc_khz(void) 866static unsigned long lguest_tsc_khz(void)
732{ 867{
733 return lguest_data.tsc_khz; 868 return lguest_data.tsc_khz;
734} 869}
735 870
736/* If we can't use the TSC, the kernel falls back to our lower-priority 871/*
737 * "lguest_clock", where we read the time value given to us by the Host. */ 872 * If we can't use the TSC, the kernel falls back to our lower-priority
873 * "lguest_clock", where we read the time value given to us by the Host.
874 */
738static cycle_t lguest_clock_read(struct clocksource *cs) 875static cycle_t lguest_clock_read(struct clocksource *cs)
739{ 876{
740 unsigned long sec, nsec; 877 unsigned long sec, nsec;
741 878
742 /* Since the time is in two parts (seconds and nanoseconds), we risk 879 /*
880 * Since the time is in two parts (seconds and nanoseconds), we risk
743 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, 881 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
744 * and getting 99 and 0. As Linux tends to come apart under the stress 882 * and getting 99 and 0. As Linux tends to come apart under the stress
745 * of time travel, we must be careful: */ 883 * of time travel, we must be careful:
884 */
746 do { 885 do {
747 /* First we read the seconds part. */ 886 /* First we read the seconds part. */
748 sec = lguest_data.time.tv_sec; 887 sec = lguest_data.time.tv_sec;
749 /* This read memory barrier tells the compiler and the CPU that 888 /*
889 * This read memory barrier tells the compiler and the CPU that
750 * this can't be reordered: we have to complete the above 890 * this can't be reordered: we have to complete the above
751 * before going on. */ 891 * before going on.
892 */
752 rmb(); 893 rmb();
753 /* Now we read the nanoseconds part. */ 894 /* Now we read the nanoseconds part. */
754 nsec = lguest_data.time.tv_nsec; 895 nsec = lguest_data.time.tv_nsec;
@@ -772,9 +913,11 @@ static struct clocksource lguest_clock = {
772 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 913 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
773}; 914};
774 915
775/* We also need a "struct clock_event_device": Linux asks us to set it to go 916/*
917 * We also need a "struct clock_event_device": Linux asks us to set it to go
776 * off some time in the future. Actually, James Morris figured all this out, I 918 * off some time in the future. Actually, James Morris figured all this out, I
777 * just applied the patch. */ 919 * just applied the patch.
920 */
778static int lguest_clockevent_set_next_event(unsigned long delta, 921static int lguest_clockevent_set_next_event(unsigned long delta,
779 struct clock_event_device *evt) 922 struct clock_event_device *evt)
780{ 923{
@@ -824,8 +967,10 @@ static struct clock_event_device lguest_clockevent = {
824 .max_delta_ns = LG_CLOCK_MAX_DELTA, 967 .max_delta_ns = LG_CLOCK_MAX_DELTA,
825}; 968};
826 969
827/* This is the Guest timer interrupt handler (hardware interrupt 0). We just 970/*
828 * call the clockevent infrastructure and it does whatever needs doing. */ 971 * This is the Guest timer interrupt handler (hardware interrupt 0). We just
972 * call the clockevent infrastructure and it does whatever needs doing.
973 */
829static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) 974static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
830{ 975{
831 unsigned long flags; 976 unsigned long flags;
@@ -836,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
836 local_irq_restore(flags); 981 local_irq_restore(flags);
837} 982}
838 983
839/* At some point in the boot process, we get asked to set up our timing 984/*
985 * At some point in the boot process, we get asked to set up our timing
840 * infrastructure. The kernel doesn't expect timer interrupts before this, but 986 * infrastructure. The kernel doesn't expect timer interrupts before this, but
841 * we cleverly initialized the "blocked_interrupts" field of "struct 987 * we cleverly initialized the "blocked_interrupts" field of "struct
842 * lguest_data" so that timer interrupts were blocked until now. */ 988 * lguest_data" so that timer interrupts were blocked until now.
989 */
843static void lguest_time_init(void) 990static void lguest_time_init(void)
844{ 991{
845 /* Set up the timer interrupt (0) to go to our simple timer routine */ 992 /* Set up the timer interrupt (0) to go to our simple timer routine */
@@ -863,14 +1010,16 @@ static void lguest_time_init(void)
863 * to work. They're pretty simple. 1010 * to work. They're pretty simple.
864 */ 1011 */
865 1012
866/* The Guest needs to tell the Host what stack it expects traps to use. For 1013/*
1014 * The Guest needs to tell the Host what stack it expects traps to use. For
867 * native hardware, this is part of the Task State Segment mentioned above in 1015 * native hardware, this is part of the Task State Segment mentioned above in
868 * lguest_load_tr_desc(), but to help hypervisors there's this special call. 1016 * lguest_load_tr_desc(), but to help hypervisors there's this special call.
869 * 1017 *
870 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data 1018 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
871 * segment), the privilege level (we're privilege level 1, the Host is 0 and 1019 * segment), the privilege level (we're privilege level 1, the Host is 0 and
872 * will not tolerate us trying to use that), the stack pointer, and the number 1020 * will not tolerate us trying to use that), the stack pointer, and the number
873 * of pages in the stack. */ 1021 * of pages in the stack.
1022 */
874static void lguest_load_sp0(struct tss_struct *tss, 1023static void lguest_load_sp0(struct tss_struct *tss,
875 struct thread_struct *thread) 1024 struct thread_struct *thread)
876{ 1025{
@@ -884,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value)
884 /* FIXME: Implement */ 1033 /* FIXME: Implement */
885} 1034}
886 1035
887/* There are times when the kernel wants to make sure that no memory writes are 1036/*
1037 * There are times when the kernel wants to make sure that no memory writes are
888 * caught in the cache (that they've all reached real hardware devices). This 1038 * caught in the cache (that they've all reached real hardware devices). This
889 * doesn't matter for the Guest which has virtual hardware. 1039 * doesn't matter for the Guest which has virtual hardware.
890 * 1040 *
@@ -898,11 +1048,13 @@ static void lguest_wbinvd(void)
898{ 1048{
899} 1049}
900 1050
901/* If the Guest expects to have an Advanced Programmable Interrupt Controller, 1051/*
1052 * If the Guest expects to have an Advanced Programmable Interrupt Controller,
902 * we play dumb by ignoring writes and returning 0 for reads. So it's no 1053 * we play dumb by ignoring writes and returning 0 for reads. So it's no
903 * longer Programmable nor Controlling anything, and I don't think 8 lines of 1054 * longer Programmable nor Controlling anything, and I don't think 8 lines of
904 * code qualifies for Advanced. It will also never interrupt anything. It 1055 * code qualifies for Advanced. It will also never interrupt anything. It
905 * does, however, allow us to get through the Linux boot code. */ 1056 * does, however, allow us to get through the Linux boot code.
1057 */
906#ifdef CONFIG_X86_LOCAL_APIC 1058#ifdef CONFIG_X86_LOCAL_APIC
907static void lguest_apic_write(u32 reg, u32 v) 1059static void lguest_apic_write(u32 reg, u32 v)
908{ 1060{
@@ -951,11 +1103,13 @@ static void lguest_safe_halt(void)
951 kvm_hypercall0(LHCALL_HALT); 1103 kvm_hypercall0(LHCALL_HALT);
952} 1104}
953 1105
954/* The SHUTDOWN hypercall takes a string to describe what's happening, and 1106/*
1107 * The SHUTDOWN hypercall takes a string to describe what's happening, and
955 * an argument which says whether this to restart (reboot) the Guest or not. 1108 * an argument which says whether this to restart (reboot) the Guest or not.
956 * 1109 *
957 * Note that the Host always prefers that the Guest speak in physical addresses 1110 * Note that the Host always prefers that the Guest speak in physical addresses
958 * rather than virtual addresses, so we use __pa() here. */ 1111 * rather than virtual addresses, so we use __pa() here.
1112 */
959static void lguest_power_off(void) 1113static void lguest_power_off(void)
960{ 1114{
961 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), 1115 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
@@ -986,8 +1140,10 @@ static __init char *lguest_memory_setup(void)
986 * nice to move it back to lguest_init. Patch welcome... */ 1140 * nice to move it back to lguest_init. Patch welcome... */
987 atomic_notifier_chain_register(&panic_notifier_list, &paniced); 1141 atomic_notifier_chain_register(&panic_notifier_list, &paniced);
988 1142
989 /* The Linux bootloader header contains an "e820" memory map: the 1143 /*
990 * Launcher populated the first entry with our memory limit. */ 1144 *The Linux bootloader header contains an "e820" memory map: the
1145 * Launcher populated the first entry with our memory limit.
1146 */
991 e820_add_region(boot_params.e820_map[0].addr, 1147 e820_add_region(boot_params.e820_map[0].addr,
992 boot_params.e820_map[0].size, 1148 boot_params.e820_map[0].size,
993 boot_params.e820_map[0].type); 1149 boot_params.e820_map[0].type);
@@ -996,16 +1152,17 @@ static __init char *lguest_memory_setup(void)
996 return "LGUEST"; 1152 return "LGUEST";
997} 1153}
998 1154
999/* We will eventually use the virtio console device to produce console output, 1155/*
1156 * We will eventually use the virtio console device to produce console output,
1000 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce 1157 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
1001 * console output. */ 1158 * console output.
1159 */
1002static __init int early_put_chars(u32 vtermno, const char *buf, int count) 1160static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1003{ 1161{
1004 char scratch[17]; 1162 char scratch[17];
1005 unsigned int len = count; 1163 unsigned int len = count;
1006 1164
1007 /* We use a nul-terminated string, so we have to make a copy. Icky, 1165 /* We use a nul-terminated string, so we make a copy. Icky, huh? */
1008 * huh? */
1009 if (len > sizeof(scratch) - 1) 1166 if (len > sizeof(scratch) - 1)
1010 len = sizeof(scratch) - 1; 1167 len = sizeof(scratch) - 1;
1011 scratch[len] = '\0'; 1168 scratch[len] = '\0';
@@ -1016,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1016 return len; 1173 return len;
1017} 1174}
1018 1175
1019/* Rebooting also tells the Host we're finished, but the RESTART flag tells the 1176/*
1020 * Launcher to reboot us. */ 1177 * Rebooting also tells the Host we're finished, but the RESTART flag tells the
1178 * Launcher to reboot us.
1179 */
1021static void lguest_restart(char *reason) 1180static void lguest_restart(char *reason)
1022{ 1181{
1023 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); 1182 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
@@ -1044,7 +1203,8 @@ static void lguest_restart(char *reason)
1044 * fit comfortably. 1203 * fit comfortably.
1045 * 1204 *
1046 * First we need assembly templates of each of the patchable Guest operations, 1205 * First we need assembly templates of each of the patchable Guest operations,
1047 * and these are in i386_head.S. */ 1206 * and these are in i386_head.S.
1207 */
1048 1208
1049/*G:060 We construct a table from the assembler templates: */ 1209/*G:060 We construct a table from the assembler templates: */
1050static const struct lguest_insns 1210static const struct lguest_insns
@@ -1055,9 +1215,11 @@ static const struct lguest_insns
1055 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1215 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
1056}; 1216};
1057 1217
1058/* Now our patch routine is fairly simple (based on the native one in 1218/*
1219 * Now our patch routine is fairly simple (based on the native one in
1059 * paravirt.c). If we have a replacement, we copy it in and return how much of 1220 * paravirt.c). If we have a replacement, we copy it in and return how much of
1060 * the available space we used. */ 1221 * the available space we used.
1222 */
1061static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, 1223static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1062 unsigned long addr, unsigned len) 1224 unsigned long addr, unsigned len)
1063{ 1225{
@@ -1069,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1069 1231
1070 insn_len = lguest_insns[type].end - lguest_insns[type].start; 1232 insn_len = lguest_insns[type].end - lguest_insns[type].start;
1071 1233
1072 /* Similarly if we can't fit replacement (shouldn't happen, but let's 1234 /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
1073 * be thorough). */
1074 if (len < insn_len) 1235 if (len < insn_len)
1075 return paravirt_patch_default(type, clobber, ibuf, addr, len); 1236 return paravirt_patch_default(type, clobber, ibuf, addr, len);
1076 1237
@@ -1079,22 +1240,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
1079 return insn_len; 1240 return insn_len;
1080} 1241}
1081 1242
1082/*G:030 Once we get to lguest_init(), we know we're a Guest. The various 1243/*G:029
1244 * Once we get to lguest_init(), we know we're a Guest. The various
1083 * pv_ops structures in the kernel provide points for (almost) every routine we 1245 * pv_ops structures in the kernel provide points for (almost) every routine we
1084 * have to override to avoid privileged instructions. */ 1246 * have to override to avoid privileged instructions.
1247 */
1085__init void lguest_init(void) 1248__init void lguest_init(void)
1086{ 1249{
1087 /* We're under lguest, paravirt is enabled, and we're running at 1250 /* We're under lguest. */
1088 * privilege level 1, not 0 as normal. */
1089 pv_info.name = "lguest"; 1251 pv_info.name = "lguest";
1252 /* Paravirt is enabled. */
1090 pv_info.paravirt_enabled = 1; 1253 pv_info.paravirt_enabled = 1;
1254 /* We're running at privilege level 1, not 0 as normal. */
1091 pv_info.kernel_rpl = 1; 1255 pv_info.kernel_rpl = 1;
1256 /* Everyone except Xen runs with this set. */
1092 pv_info.shared_kernel_pmd = 1; 1257 pv_info.shared_kernel_pmd = 1;
1093 1258
1094 /* We set up all the lguest overrides for sensitive operations. These 1259 /*
1095 * are detailed with the operations themselves. */ 1260 * We set up all the lguest overrides for sensitive operations. These
1261 * are detailed with the operations themselves.
1262 */
1096 1263
1097 /* interrupt-related operations */ 1264 /* Interrupt-related operations */
1098 pv_irq_ops.init_IRQ = lguest_init_IRQ; 1265 pv_irq_ops.init_IRQ = lguest_init_IRQ;
1099 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1266 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
1100 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); 1267 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
@@ -1102,11 +1269,11 @@ __init void lguest_init(void)
1102 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); 1269 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
1103 pv_irq_ops.safe_halt = lguest_safe_halt; 1270 pv_irq_ops.safe_halt = lguest_safe_halt;
1104 1271
1105 /* init-time operations */ 1272 /* Setup operations */
1106 pv_init_ops.memory_setup = lguest_memory_setup; 1273 pv_init_ops.memory_setup = lguest_memory_setup;
1107 pv_init_ops.patch = lguest_patch; 1274 pv_init_ops.patch = lguest_patch;
1108 1275
1109 /* Intercepts of various cpu instructions */ 1276 /* Intercepts of various CPU instructions */
1110 pv_cpu_ops.load_gdt = lguest_load_gdt; 1277 pv_cpu_ops.load_gdt = lguest_load_gdt;
1111 pv_cpu_ops.cpuid = lguest_cpuid; 1278 pv_cpu_ops.cpuid = lguest_cpuid;
1112 pv_cpu_ops.load_idt = lguest_load_idt; 1279 pv_cpu_ops.load_idt = lguest_load_idt;
@@ -1127,7 +1294,7 @@ __init void lguest_init(void)
1127 pv_cpu_ops.start_context_switch = paravirt_start_context_switch; 1294 pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
1128 pv_cpu_ops.end_context_switch = lguest_end_context_switch; 1295 pv_cpu_ops.end_context_switch = lguest_end_context_switch;
1129 1296
1130 /* pagetable management */ 1297 /* Pagetable management */
1131 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1298 pv_mmu_ops.write_cr3 = lguest_write_cr3;
1132 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; 1299 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
1133 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; 1300 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
@@ -1149,54 +1316,71 @@ __init void lguest_init(void)
1149 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1316 pv_mmu_ops.pte_update_defer = lguest_pte_update;
1150 1317
1151#ifdef CONFIG_X86_LOCAL_APIC 1318#ifdef CONFIG_X86_LOCAL_APIC
1152 /* apic read/write intercepts */ 1319 /* APIC read/write intercepts */
1153 set_lguest_basic_apic_ops(); 1320 set_lguest_basic_apic_ops();
1154#endif 1321#endif
1155 1322
1156 /* time operations */ 1323 /* Time operations */
1157 pv_time_ops.get_wallclock = lguest_get_wallclock; 1324 pv_time_ops.get_wallclock = lguest_get_wallclock;
1158 pv_time_ops.time_init = lguest_time_init; 1325 pv_time_ops.time_init = lguest_time_init;
1159 pv_time_ops.get_tsc_khz = lguest_tsc_khz; 1326 pv_time_ops.get_tsc_khz = lguest_tsc_khz;
1160 1327
1161 /* Now is a good time to look at the implementations of these functions 1328 /*
1162 * before returning to the rest of lguest_init(). */ 1329 * Now is a good time to look at the implementations of these functions
1330 * before returning to the rest of lguest_init().
1331 */
1163 1332
1164 /*G:070 Now we've seen all the paravirt_ops, we return to 1333 /*G:070
1334 * Now we've seen all the paravirt_ops, we return to
1165 * lguest_init() where the rest of the fairly chaotic boot setup 1335 * lguest_init() where the rest of the fairly chaotic boot setup
1166 * occurs. */ 1336 * occurs.
1337 */
1167 1338
1168 /* The stack protector is a weird thing where gcc places a canary 1339 /*
1340 * The stack protector is a weird thing where gcc places a canary
1169 * value on the stack and then checks it on return. This file is 1341 * value on the stack and then checks it on return. This file is
1170 * compiled with -fno-stack-protector it, so we got this far without 1342 * compiled with -fno-stack-protector it, so we got this far without
1171 * problems. The value of the canary is kept at offset 20 from the 1343 * problems. The value of the canary is kept at offset 20 from the
1172 * %gs register, so we need to set that up before calling C functions 1344 * %gs register, so we need to set that up before calling C functions
1173 * in other files. */ 1345 * in other files.
1346 */
1174 setup_stack_canary_segment(0); 1347 setup_stack_canary_segment(0);
1175 /* We could just call load_stack_canary_segment(), but we might as 1348
1176 * call switch_to_new_gdt() which loads the whole table and sets up 1349 /*
1177 * the per-cpu segment descriptor register %fs as well. */ 1350 * We could just call load_stack_canary_segment(), but we might as well
1351 * call switch_to_new_gdt() which loads the whole table and sets up the
1352 * per-cpu segment descriptor register %fs as well.
1353 */
1178 switch_to_new_gdt(0); 1354 switch_to_new_gdt(0);
1179 1355
1180 /* As described in head_32.S, we map the first 128M of memory. */ 1356 /* We actually boot with all memory mapped, but let's say 128MB. */
1181 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1357 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1182 1358
1183 /* The Host<->Guest Switcher lives at the top of our address space, and 1359 /*
1360 * The Host<->Guest Switcher lives at the top of our address space, and
1184 * the Host told us how big it is when we made LGUEST_INIT hypercall: 1361 * the Host told us how big it is when we made LGUEST_INIT hypercall:
1185 * it put the answer in lguest_data.reserve_mem */ 1362 * it put the answer in lguest_data.reserve_mem
1363 */
1186 reserve_top_address(lguest_data.reserve_mem); 1364 reserve_top_address(lguest_data.reserve_mem);
1187 1365
1188 /* If we don't initialize the lock dependency checker now, it crashes 1366 /*
1189 * paravirt_disable_iospace. */ 1367 * If we don't initialize the lock dependency checker now, it crashes
1368 * paravirt_disable_iospace.
1369 */
1190 lockdep_init(); 1370 lockdep_init();
1191 1371
1192 /* The IDE code spends about 3 seconds probing for disks: if we reserve 1372 /*
1373 * The IDE code spends about 3 seconds probing for disks: if we reserve
1193 * all the I/O ports up front it can't get them and so doesn't probe. 1374 * all the I/O ports up front it can't get them and so doesn't probe.
1194 * Other device drivers are similar (but less severe). This cuts the 1375 * Other device drivers are similar (but less severe). This cuts the
1195 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ 1376 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds.
1377 */
1196 paravirt_disable_iospace(); 1378 paravirt_disable_iospace();
1197 1379
1198 /* This is messy CPU setup stuff which the native boot code does before 1380 /*
1199 * start_kernel, so we have to do, too: */ 1381 * This is messy CPU setup stuff which the native boot code does before
1382 * start_kernel, so we have to do, too:
1383 */
1200 cpu_detect(&new_cpu_data); 1384 cpu_detect(&new_cpu_data);
1201 /* head.S usually sets up the first capability word, so do it here. */ 1385 /* head.S usually sets up the first capability word, so do it here. */
1202 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1386 new_cpu_data.x86_capability[0] = cpuid_edx(1);
@@ -1213,22 +1397,28 @@ __init void lguest_init(void)
1213 acpi_ht = 0; 1397 acpi_ht = 0;
1214#endif 1398#endif
1215 1399
1216 /* We set the preferred console to "hvc". This is the "hypervisor 1400 /*
1401 * We set the preferred console to "hvc". This is the "hypervisor
1217 * virtual console" driver written by the PowerPC people, which we also 1402 * virtual console" driver written by the PowerPC people, which we also
1218 * adapted for lguest's use. */ 1403 * adapted for lguest's use.
1404 */
1219 add_preferred_console("hvc", 0, NULL); 1405 add_preferred_console("hvc", 0, NULL);
1220 1406
1221 /* Register our very early console. */ 1407 /* Register our very early console. */
1222 virtio_cons_early_init(early_put_chars); 1408 virtio_cons_early_init(early_put_chars);
1223 1409
1224 /* Last of all, we set the power management poweroff hook to point to 1410 /*
1411 * Last of all, we set the power management poweroff hook to point to
1225 * the Guest routine to power off, and the reboot hook to our restart 1412 * the Guest routine to power off, and the reboot hook to our restart
1226 * routine. */ 1413 * routine.
1414 */
1227 pm_power_off = lguest_power_off; 1415 pm_power_off = lguest_power_off;
1228 machine_ops.restart = lguest_restart; 1416 machine_ops.restart = lguest_restart;
1229 1417
1230 /* Now we're set up, call i386_start_kernel() in head32.c and we proceed 1418 /*
1231 * to boot as normal. It never returns. */ 1419 * Now we're set up, call i386_start_kernel() in head32.c and we proceed
1420 * to boot as normal. It never returns.
1421 */
1232 i386_start_kernel(); 1422 i386_start_kernel();
1233} 1423}
1234/* 1424/*
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index a9c8cfe61cd4..27eac0faee48 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -5,7 +5,8 @@
5#include <asm/thread_info.h> 5#include <asm/thread_info.h>
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7 7
8/*G:020 Our story starts with the kernel booting into startup_32 in 8/*G:020
9 * Our story starts with the kernel booting into startup_32 in
9 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by 10 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
10 * the bootloader (the Launcher in our case). 11 * the bootloader (the Launcher in our case).
11 * 12 *
@@ -21,11 +22,14 @@
21 * data without remembering to subtract __PAGE_OFFSET! 22 * data without remembering to subtract __PAGE_OFFSET!
22 * 23 *
23 * The .section line puts this code in .init.text so it will be discarded after 24 * The .section line puts this code in .init.text so it will be discarded after
24 * boot. */ 25 * boot.
26 */
25.section .init.text, "ax", @progbits 27.section .init.text, "ax", @progbits
26ENTRY(lguest_entry) 28ENTRY(lguest_entry)
27 /* We make the "initialization" hypercall now to tell the Host about 29 /*
28 * us, and also find out where it put our page tables. */ 30 * We make the "initialization" hypercall now to tell the Host about
31 * us, and also find out where it put our page tables.
32 */
29 movl $LHCALL_LGUEST_INIT, %eax 33 movl $LHCALL_LGUEST_INIT, %eax
30 movl $lguest_data - __PAGE_OFFSET, %ebx 34 movl $lguest_data - __PAGE_OFFSET, %ebx
31 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 35 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
@@ -33,13 +37,14 @@ ENTRY(lguest_entry)
33 /* Set up the initial stack so we can run C code. */ 37 /* Set up the initial stack so we can run C code. */
34 movl $(init_thread_union+THREAD_SIZE),%esp 38 movl $(init_thread_union+THREAD_SIZE),%esp
35 39
36 /* Jumps are relative, and we're running __PAGE_OFFSET too low at the 40 /* Jumps are relative: we're running __PAGE_OFFSET too low. */
37 * moment. */
38 jmp lguest_init+__PAGE_OFFSET 41 jmp lguest_init+__PAGE_OFFSET
39 42
40/*G:055 We create a macro which puts the assembler code between lgstart_ and 43/*G:055
41 * lgend_ markers. These templates are put in the .text section: they can't be 44 * We create a macro which puts the assembler code between lgstart_ and lgend_
42 * discarded after boot as we may need to patch modules, too. */ 45 * markers. These templates are put in the .text section: they can't be
46 * discarded after boot as we may need to patch modules, too.
47 */
43.text 48.text
44#define LGUEST_PATCH(name, insns...) \ 49#define LGUEST_PATCH(name, insns...) \
45 lgstart_##name: insns; lgend_##name:; \ 50 lgstart_##name: insns; lgend_##name:; \
@@ -48,83 +53,103 @@ ENTRY(lguest_entry)
48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 53LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
49LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 54LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
50 55
51/*G:033 But using those wrappers is inefficient (we'll see why that doesn't 56/*G:033
52 * matter for save_fl and irq_disable later). If we write our routines 57 * But using those wrappers is inefficient (we'll see why that doesn't matter
53 * carefully in assembler, we can avoid clobbering any registers and avoid 58 * for save_fl and irq_disable later). If we write our routines carefully in
54 * jumping through the wrapper functions. 59 * assembler, we can avoid clobbering any registers and avoid jumping through
60 * the wrapper functions.
55 * 61 *
56 * I skipped over our first piece of assembler, but this one is worth studying 62 * I skipped over our first piece of assembler, but this one is worth studying
57 * in a bit more detail so I'll describe in easy stages. First, the routine 63 * in a bit more detail so I'll describe in easy stages. First, the routine to
58 * to enable interrupts: */ 64 * enable interrupts:
65 */
59ENTRY(lg_irq_enable) 66ENTRY(lg_irq_enable)
60 /* The reverse of irq_disable, this sets lguest_data.irq_enabled to 67 /*
61 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ 68 * The reverse of irq_disable, this sets lguest_data.irq_enabled to
69 * X86_EFLAGS_IF (ie. "Interrupts enabled").
70 */
62 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled 71 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
63 /* But now we need to check if the Host wants to know: there might have 72 /*
73 * But now we need to check if the Host wants to know: there might have
64 * been interrupts waiting to be delivered, in which case it will have 74 * been interrupts waiting to be delivered, in which case it will have
65 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we 75 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
66 * jump to send_interrupts, otherwise we're done. */ 76 * jump to send_interrupts, otherwise we're done.
77 */
67 testl $0, lguest_data+LGUEST_DATA_irq_pending 78 testl $0, lguest_data+LGUEST_DATA_irq_pending
68 jnz send_interrupts 79 jnz send_interrupts
69 /* One cool thing about x86 is that you can do many things without using 80 /*
81 * One cool thing about x86 is that you can do many things without using
70 * a register. In this case, the normal path hasn't needed to save or 82 * a register. In this case, the normal path hasn't needed to save or
71 * restore any registers at all! */ 83 * restore any registers at all!
84 */
72 ret 85 ret
73send_interrupts: 86send_interrupts:
74 /* OK, now we need a register: eax is used for the hypercall number, 87 /*
88 * OK, now we need a register: eax is used for the hypercall number,
75 * which is LHCALL_SEND_INTERRUPTS. 89 * which is LHCALL_SEND_INTERRUPTS.
76 * 90 *
77 * We used not to bother with this pending detection at all, which was 91 * We used not to bother with this pending detection at all, which was
78 * much simpler. Sooner or later the Host would realize it had to 92 * much simpler. Sooner or later the Host would realize it had to
79 * send us an interrupt. But that turns out to make performance 7 93 * send us an interrupt. But that turns out to make performance 7
80 * times worse on a simple tcp benchmark. So now we do this the hard 94 * times worse on a simple tcp benchmark. So now we do this the hard
81 * way. */ 95 * way.
96 */
82 pushl %eax 97 pushl %eax
83 movl $LHCALL_SEND_INTERRUPTS, %eax 98 movl $LHCALL_SEND_INTERRUPTS, %eax
84 /* This is a vmcall instruction (same thing that KVM uses). Older 99 /*
100 * This is a vmcall instruction (same thing that KVM uses). Older
85 * assembler versions might not know the "vmcall" instruction, so we 101 * assembler versions might not know the "vmcall" instruction, so we
86 * create one manually here. */ 102 * create one manually here.
103 */
87 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 104 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
105 /* Put eax back the way we found it. */
88 popl %eax 106 popl %eax
89 ret 107 ret
90 108
91/* Finally, the "popf" or "restore flags" routine. The %eax register holds the 109/*
110 * Finally, the "popf" or "restore flags" routine. The %eax register holds the
92 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're 111 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
93 * enabling interrupts again, if it's 0 we're leaving them off. */ 112 * enabling interrupts again, if it's 0 we're leaving them off.
113 */
94ENTRY(lg_restore_fl) 114ENTRY(lg_restore_fl)
95 /* This is just "lguest_data.irq_enabled = flags;" */ 115 /* This is just "lguest_data.irq_enabled = flags;" */
96 movl %eax, lguest_data+LGUEST_DATA_irq_enabled 116 movl %eax, lguest_data+LGUEST_DATA_irq_enabled
97 /* Now, if the %eax value has enabled interrupts and 117 /*
118 * Now, if the %eax value has enabled interrupts and
98 * lguest_data.irq_pending is set, we want to tell the Host so it can 119 * lguest_data.irq_pending is set, we want to tell the Host so it can
99 * deliver any outstanding interrupts. Fortunately, both values will 120 * deliver any outstanding interrupts. Fortunately, both values will
100 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" 121 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
101 * instruction will AND them together for us. If both are set, we 122 * instruction will AND them together for us. If both are set, we
102 * jump to send_interrupts. */ 123 * jump to send_interrupts.
124 */
103 testl lguest_data+LGUEST_DATA_irq_pending, %eax 125 testl lguest_data+LGUEST_DATA_irq_pending, %eax
104 jnz send_interrupts 126 jnz send_interrupts
105 /* Again, the normal path has used no extra registers. Clever, huh? */ 127 /* Again, the normal path has used no extra registers. Clever, huh? */
106 ret 128 ret
129/*:*/
107 130
108/* These demark the EIP range where host should never deliver interrupts. */ 131/* These demark the EIP range where host should never deliver interrupts. */
109.global lguest_noirq_start 132.global lguest_noirq_start
110.global lguest_noirq_end 133.global lguest_noirq_end
111 134
112/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, 135/*M:004
113 * it sets the eflags interrupt bit on the stack based on 136 * When the Host reflects a trap or injects an interrupt into the Guest, it
114 * lguest_data.irq_enabled, so the Guest iret logic does the right thing when 137 * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
115 * restoring it. However, when the Host sets the Guest up for direct traps, 138 * so the Guest iret logic does the right thing when restoring it. However,
116 * such as system calls, the processor is the one to push eflags onto the 139 * when the Host sets the Guest up for direct traps, such as system calls, the
117 * stack, and the interrupt bit will be 1 (in reality, interrupts are always 140 * processor is the one to push eflags onto the stack, and the interrupt bit
118 * enabled in the Guest). 141 * will be 1 (in reality, interrupts are always enabled in the Guest).
119 * 142 *
120 * This turns out to be harmless: the only trap which should happen under Linux 143 * This turns out to be harmless: the only trap which should happen under Linux
121 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc 144 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
122 * regions), which has to be reflected through the Host anyway. If another 145 * regions), which has to be reflected through the Host anyway. If another
123 * trap *does* go off when interrupts are disabled, the Guest will panic, and 146 * trap *does* go off when interrupts are disabled, the Guest will panic, and
124 * we'll never get to this iret! :*/ 147 * we'll never get to this iret!
148:*/
125 149
126/*G:045 There is one final paravirt_op that the Guest implements, and glancing 150/*G:045
127 * at it you can see why I left it to last. It's *cool*! It's in *assembler*! 151 * There is one final paravirt_op that the Guest implements, and glancing at it
152 * you can see why I left it to last. It's *cool*! It's in *assembler*!
128 * 153 *
129 * The "iret" instruction is used to return from an interrupt or trap. The 154 * The "iret" instruction is used to return from an interrupt or trap. The
130 * stack looks like this: 155 * stack looks like this:
@@ -148,15 +173,18 @@ ENTRY(lg_restore_fl)
148 * return to userspace or wherever. Our solution to this is to surround the 173 * return to userspace or wherever. Our solution to this is to surround the
149 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the 174 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
150 * Host that it is *never* to interrupt us there, even if interrupts seem to be 175 * Host that it is *never* to interrupt us there, even if interrupts seem to be
151 * enabled. */ 176 * enabled.
177 */
152ENTRY(lguest_iret) 178ENTRY(lguest_iret)
153 pushl %eax 179 pushl %eax
154 movl 12(%esp), %eax 180 movl 12(%esp), %eax
155lguest_noirq_start: 181lguest_noirq_start:
156 /* Note the %ss: segment prefix here. Normal data accesses use the 182 /*
183 * Note the %ss: segment prefix here. Normal data accesses use the
157 * "ds" segment, but that will have already been restored for whatever 184 * "ds" segment, but that will have already been restored for whatever
158 * we're returning to (such as userspace): we can't trust it. The %ss: 185 * we're returning to (such as userspace): we can't trust it. The %ss:
159 * prefix makes sure we use the stack segment, which is still valid. */ 186 * prefix makes sure we use the stack segment, which is still valid.
187 */
160 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled 188 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
161 popl %eax 189 popl %eax
162 iret 190 iret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f9d35632666b..07c31899c9c2 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -10,6 +10,7 @@ lib-y += usercopy_$(BITS).o getuser.o putuser.o
10lib-y += memcpy_$(BITS).o 10lib-y += memcpy_$(BITS).o
11 11
12ifeq ($(CONFIG_X86_32),y) 12ifeq ($(CONFIG_X86_32),y)
13 obj-y += atomic64_32.o
13 lib-y += checksum_32.o 14 lib-y += checksum_32.o
14 lib-y += strstr_32.o 15 lib-y += strstr_32.o
15 lib-y += semaphore_32.o string_32.o 16 lib-y += semaphore_32.o string_32.o
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
new file mode 100644
index 000000000000..824fa0be55a3
--- /dev/null
+++ b/arch/x86/lib/atomic64_32.c
@@ -0,0 +1,230 @@
1#include <linux/compiler.h>
2#include <linux/module.h>
3#include <linux/types.h>
4
5#include <asm/processor.h>
6#include <asm/cmpxchg.h>
7#include <asm/atomic.h>
8
9static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new)
10{
11 u32 low = new;
12 u32 high = new >> 32;
13
14 asm volatile(
15 LOCK_PREFIX "cmpxchg8b %1\n"
16 : "+A" (old), "+m" (*ptr)
17 : "b" (low), "c" (high)
18 );
19 return old;
20}
21
22u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val)
23{
24 return cmpxchg8b(&ptr->counter, old_val, new_val);
25}
26EXPORT_SYMBOL(atomic64_cmpxchg);
27
28/**
29 * atomic64_xchg - xchg atomic64 variable
30 * @ptr: pointer to type atomic64_t
31 * @new_val: value to assign
32 *
33 * Atomically xchgs the value of @ptr to @new_val and returns
34 * the old value.
35 */
36u64 atomic64_xchg(atomic64_t *ptr, u64 new_val)
37{
38 /*
39 * Try first with a (possibly incorrect) assumption about
40 * what we have there. We'll do two loops most likely,
41 * but we'll get an ownership MESI transaction straight away
42 * instead of a read transaction followed by a
43 * flush-for-ownership transaction:
44 */
45 u64 old_val, real_val = 0;
46
47 do {
48 old_val = real_val;
49
50 real_val = atomic64_cmpxchg(ptr, old_val, new_val);
51
52 } while (real_val != old_val);
53
54 return old_val;
55}
56EXPORT_SYMBOL(atomic64_xchg);
57
58/**
59 * atomic64_set - set atomic64 variable
60 * @ptr: pointer to type atomic64_t
61 * @new_val: value to assign
62 *
63 * Atomically sets the value of @ptr to @new_val.
64 */
65void atomic64_set(atomic64_t *ptr, u64 new_val)
66{
67 atomic64_xchg(ptr, new_val);
68}
69EXPORT_SYMBOL(atomic64_set);
70
71/**
72EXPORT_SYMBOL(atomic64_read);
73 * atomic64_add_return - add and return
74 * @delta: integer value to add
75 * @ptr: pointer to type atomic64_t
76 *
77 * Atomically adds @delta to @ptr and returns @delta + *@ptr
78 */
79noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
80{
81 /*
82 * Try first with a (possibly incorrect) assumption about
83 * what we have there. We'll do two loops most likely,
84 * but we'll get an ownership MESI transaction straight away
85 * instead of a read transaction followed by a
86 * flush-for-ownership transaction:
87 */
88 u64 old_val, new_val, real_val = 0;
89
90 do {
91 old_val = real_val;
92 new_val = old_val + delta;
93
94 real_val = atomic64_cmpxchg(ptr, old_val, new_val);
95
96 } while (real_val != old_val);
97
98 return new_val;
99}
100EXPORT_SYMBOL(atomic64_add_return);
101
102u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
103{
104 return atomic64_add_return(-delta, ptr);
105}
106EXPORT_SYMBOL(atomic64_sub_return);
107
108u64 atomic64_inc_return(atomic64_t *ptr)
109{
110 return atomic64_add_return(1, ptr);
111}
112EXPORT_SYMBOL(atomic64_inc_return);
113
114u64 atomic64_dec_return(atomic64_t *ptr)
115{
116 return atomic64_sub_return(1, ptr);
117}
118EXPORT_SYMBOL(atomic64_dec_return);
119
120/**
121 * atomic64_add - add integer to atomic64 variable
122 * @delta: integer value to add
123 * @ptr: pointer to type atomic64_t
124 *
125 * Atomically adds @delta to @ptr.
126 */
127void atomic64_add(u64 delta, atomic64_t *ptr)
128{
129 atomic64_add_return(delta, ptr);
130}
131EXPORT_SYMBOL(atomic64_add);
132
133/**
134 * atomic64_sub - subtract the atomic64 variable
135 * @delta: integer value to subtract
136 * @ptr: pointer to type atomic64_t
137 *
138 * Atomically subtracts @delta from @ptr.
139 */
140void atomic64_sub(u64 delta, atomic64_t *ptr)
141{
142 atomic64_add(-delta, ptr);
143}
144EXPORT_SYMBOL(atomic64_sub);
145
146/**
147 * atomic64_sub_and_test - subtract value from variable and test result
148 * @delta: integer value to subtract
149 * @ptr: pointer to type atomic64_t
150 *
151 * Atomically subtracts @delta from @ptr and returns
152 * true if the result is zero, or false for all
153 * other cases.
154 */
155int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
156{
157 u64 new_val = atomic64_sub_return(delta, ptr);
158
159 return new_val == 0;
160}
161EXPORT_SYMBOL(atomic64_sub_and_test);
162
163/**
164 * atomic64_inc - increment atomic64 variable
165 * @ptr: pointer to type atomic64_t
166 *
167 * Atomically increments @ptr by 1.
168 */
169void atomic64_inc(atomic64_t *ptr)
170{
171 atomic64_add(1, ptr);
172}
173EXPORT_SYMBOL(atomic64_inc);
174
175/**
176 * atomic64_dec - decrement atomic64 variable
177 * @ptr: pointer to type atomic64_t
178 *
179 * Atomically decrements @ptr by 1.
180 */
181void atomic64_dec(atomic64_t *ptr)
182{
183 atomic64_sub(1, ptr);
184}
185EXPORT_SYMBOL(atomic64_dec);
186
187/**
188 * atomic64_dec_and_test - decrement and test
189 * @ptr: pointer to type atomic64_t
190 *
191 * Atomically decrements @ptr by 1 and
192 * returns true if the result is 0, or false for all other
193 * cases.
194 */
195int atomic64_dec_and_test(atomic64_t *ptr)
196{
197 return atomic64_sub_and_test(1, ptr);
198}
199EXPORT_SYMBOL(atomic64_dec_and_test);
200
201/**
202 * atomic64_inc_and_test - increment and test
203 * @ptr: pointer to type atomic64_t
204 *
205 * Atomically increments @ptr by 1
206 * and returns true if the result is zero, or false for all
207 * other cases.
208 */
209int atomic64_inc_and_test(atomic64_t *ptr)
210{
211 return atomic64_sub_and_test(-1, ptr);
212}
213EXPORT_SYMBOL(atomic64_inc_and_test);
214
215/**
216 * atomic64_add_negative - add and test if negative
217 * @delta: integer value to add
218 * @ptr: pointer to type atomic64_t
219 *
220 * Atomically adds @delta to @ptr and returns true
221 * if the result is negative, or false when
222 * result is greater than or equal to zero.
223 */
224int atomic64_add_negative(u64 delta, atomic64_t *ptr)
225{
226 s64 new_val = atomic64_add_return(delta, ptr);
227
228 return new_val < 0;
229}
230EXPORT_SYMBOL(atomic64_add_negative);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 9a10a78bb4a4..ebeafcce04a9 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -5,15 +5,14 @@
5 * Zero a page. 5 * Zero a page.
6 * rdi page 6 * rdi page
7 */ 7 */
8 ALIGN 8ENTRY(clear_page_c)
9clear_page_c:
10 CFI_STARTPROC 9 CFI_STARTPROC
11 movl $4096/8,%ecx 10 movl $4096/8,%ecx
12 xorl %eax,%eax 11 xorl %eax,%eax
13 rep stosq 12 rep stosq
14 ret 13 ret
15 CFI_ENDPROC 14 CFI_ENDPROC
16ENDPROC(clear_page) 15ENDPROC(clear_page_c)
17 16
18ENTRY(clear_page) 17ENTRY(clear_page)
19 CFI_STARTPROC 18 CFI_STARTPROC
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index f118c110af32..6ba0f7bb85ea 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -75,6 +75,7 @@ ENTRY(copy_to_user)
75 jae bad_to_user 75 jae bad_to_user
76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
77 CFI_ENDPROC 77 CFI_ENDPROC
78ENDPROC(copy_to_user)
78 79
79/* Standard copy_from_user with segment limit checking */ 80/* Standard copy_from_user with segment limit checking */
80ENTRY(copy_from_user) 81ENTRY(copy_from_user)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index f4568605d7d5..ff485d361182 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -55,8 +55,10 @@ static void delay_tsc(unsigned long loops)
55 55
56 preempt_disable(); 56 preempt_disable();
57 cpu = smp_processor_id(); 57 cpu = smp_processor_id();
58 rdtsc_barrier();
58 rdtscl(bclock); 59 rdtscl(bclock);
59 for (;;) { 60 for (;;) {
61 rdtsc_barrier();
60 rdtscl(now); 62 rdtscl(now);
61 if ((now - bclock) >= loops) 63 if ((now - bclock) >= loops)
62 break; 64 break;
@@ -78,6 +80,7 @@ static void delay_tsc(unsigned long loops)
78 if (unlikely(cpu != smp_processor_id())) { 80 if (unlikely(cpu != smp_processor_id())) {
79 loops -= (now - bclock); 81 loops -= (now - bclock);
80 cpu = smp_processor_id(); 82 cpu = smp_processor_id();
83 rdtsc_barrier();
81 rdtscl(bclock); 84 rdtscl(bclock);
82 } 85 }
83 } 86 }
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 1440b9c0547e..caa24aca8115 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -89,16 +89,13 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
89 rv.msrs = msrs; 89 rv.msrs = msrs;
90 rv.msr_no = msr_no; 90 rv.msr_no = msr_no;
91 91
92 preempt_disable(); 92 this_cpu = get_cpu();
93 /* 93
94 * FIXME: handle the CPU we're executing on separately for now until 94 if (cpumask_test_cpu(this_cpu, mask))
95 * smp_call_function_many has been fixed to not skip it. 95 __rdmsr_on_cpu(&rv);
96 */
97 this_cpu = raw_smp_processor_id();
98 smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
99 96
100 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); 97 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
101 preempt_enable(); 98 put_cpu();
102} 99}
103EXPORT_SYMBOL(rdmsr_on_cpus); 100EXPORT_SYMBOL(rdmsr_on_cpus);
104 101
@@ -121,16 +118,13 @@ void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
121 rv.msrs = msrs; 118 rv.msrs = msrs;
122 rv.msr_no = msr_no; 119 rv.msr_no = msr_no;
123 120
124 preempt_disable(); 121 this_cpu = get_cpu();
125 /* 122
126 * FIXME: handle the CPU we're executing on separately for now until 123 if (cpumask_test_cpu(this_cpu, mask))
127 * smp_call_function_many has been fixed to not skip it. 124 __wrmsr_on_cpu(&rv);
128 */
129 this_cpu = raw_smp_processor_id();
130 smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
131 125
132 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); 126 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
133 preempt_enable(); 127 put_cpu();
134} 128}
135EXPORT_SYMBOL(wrmsr_on_cpus); 129EXPORT_SYMBOL(wrmsr_on_cpus);
136 130
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7c8ca91bb9ec..1f118d462acc 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -751,7 +751,7 @@ survive:
751 751
752 if (retval == -ENOMEM && is_global_init(current)) { 752 if (retval == -ENOMEM && is_global_init(current)) {
753 up_read(&current->mm->mmap_sem); 753 up_read(&current->mm->mmap_sem);
754 congestion_wait(WRITE, HZ/50); 754 congestion_wait(BLK_RW_ASYNC, HZ/50);
755 goto survive; 755 goto survive;
756 } 756 }
757 757
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 78a5fff857be..bfae139182ff 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -426,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address)
426} 426}
427 427
428static const char errata93_warning[] = 428static const char errata93_warning[] =
429KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" 429KERN_ERR
430KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" 430"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
431KERN_ERR "******* Please consider a BIOS update.\n" 431"******* Working around it, but it may cause SEGVs or burn power.\n"
432KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; 432"******* Please consider a BIOS update.\n"
433"******* Disabling USB legacy in the BIOS may also help.\n";
433 434
434/* 435/*
435 * No vm86 mode in 64-bit mode: 436 * No vm86 mode in 64-bit mode:
@@ -696,7 +697,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
696 if (!printk_ratelimit()) 697 if (!printk_ratelimit())
697 return; 698 return;
698 699
699 printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", 700 printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
700 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 701 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
701 tsk->comm, task_pid_nr(tsk), address, 702 tsk->comm, task_pid_nr(tsk), address,
702 (void *)regs->ip, (void *)regs->sp, error_code); 703 (void *)regs->ip, (void *)regs->sp, error_code);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 58f621e81919..2112ed55e7ea 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmap);
103EXPORT_SYMBOL(kunmap); 103EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic); 104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic); 105EXPORT_SYMBOL(kunmap_atomic);
106EXPORT_SYMBOL(kmap_atomic_prot);
106 107
107void __init set_highmem_pages_init(void) 108void __init set_highmem_pages_init(void)
108{ 109{
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index f53b57e4086f..0607119cef94 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -12,6 +12,7 @@
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
14#include <asm/tlb.h> 14#include <asm/tlb.h>
15#include <asm/proto.h>
15 16
16DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 17DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
17 18
@@ -177,20 +178,6 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
177 return nr_range; 178 return nr_range;
178} 179}
179 180
180#ifdef CONFIG_X86_64
181static void __init init_gbpages(void)
182{
183 if (direct_gbpages && cpu_has_gbpages)
184 printk(KERN_INFO "Using GB pages for direct mapping\n");
185 else
186 direct_gbpages = 0;
187}
188#else
189static inline void init_gbpages(void)
190{
191}
192#endif
193
194/* 181/*
195 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 182 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
196 * This runs before bootmem is initialized and gets pages directly from 183 * This runs before bootmem is initialized and gets pages directly from
@@ -210,9 +197,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
210 197
211 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); 198 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
212 199
213 if (!after_bootmem)
214 init_gbpages();
215
216#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) 200#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
217 /* 201 /*
218 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 202 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index c4378f4fd4a5..6176fe8f29e0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -598,6 +598,15 @@ void __init paging_init(void)
598 598
599 sparse_memory_present_with_active_regions(MAX_NUMNODES); 599 sparse_memory_present_with_active_regions(MAX_NUMNODES);
600 sparse_init(); 600 sparse_init();
601
602 /*
603 * clear the default setting with node 0
604 * note: don't use nodes_clear here, that is really clearing when
605 * numa support is not compiled in, and later node_set_state
606 * will not set it back.
607 */
608 node_clear_state(0, N_NORMAL_MEMORY);
609
601 free_area_init_nodes(max_zone_pfns); 610 free_area_init_nodes(max_zone_pfns);
602} 611}
603 612
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3cfe9ced8a4c..7e600c1962db 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -11,6 +11,7 @@
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/pfn.h>
14 15
15#include <asm/e820.h> 16#include <asm/e820.h>
16#include <asm/processor.h> 17#include <asm/processor.h>
@@ -590,9 +591,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
590 unsigned int level; 591 unsigned int level;
591 pte_t *kpte, old_pte; 592 pte_t *kpte, old_pte;
592 593
593 if (cpa->flags & CPA_PAGES_ARRAY) 594 if (cpa->flags & CPA_PAGES_ARRAY) {
594 address = (unsigned long)page_address(cpa->pages[cpa->curpage]); 595 struct page *page = cpa->pages[cpa->curpage];
595 else if (cpa->flags & CPA_ARRAY) 596 if (unlikely(PageHighMem(page)))
597 return 0;
598 address = (unsigned long)page_address(page);
599 } else if (cpa->flags & CPA_ARRAY)
596 address = cpa->vaddr[cpa->curpage]; 600 address = cpa->vaddr[cpa->curpage];
597 else 601 else
598 address = *cpa->vaddr; 602 address = *cpa->vaddr;
@@ -681,8 +685,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
681static int cpa_process_alias(struct cpa_data *cpa) 685static int cpa_process_alias(struct cpa_data *cpa)
682{ 686{
683 struct cpa_data alias_cpa; 687 struct cpa_data alias_cpa;
684 int ret = 0; 688 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
685 unsigned long temp_cpa_vaddr, vaddr; 689 unsigned long vaddr, remapped;
690 int ret;
686 691
687 if (cpa->pfn >= max_pfn_mapped) 692 if (cpa->pfn >= max_pfn_mapped)
688 return 0; 693 return 0;
@@ -695,9 +700,12 @@ static int cpa_process_alias(struct cpa_data *cpa)
695 * No need to redo, when the primary call touched the direct 700 * No need to redo, when the primary call touched the direct
696 * mapping already: 701 * mapping already:
697 */ 702 */
698 if (cpa->flags & CPA_PAGES_ARRAY) 703 if (cpa->flags & CPA_PAGES_ARRAY) {
699 vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); 704 struct page *page = cpa->pages[cpa->curpage];
700 else if (cpa->flags & CPA_ARRAY) 705 if (unlikely(PageHighMem(page)))
706 return 0;
707 vaddr = (unsigned long)page_address(page);
708 } else if (cpa->flags & CPA_ARRAY)
701 vaddr = cpa->vaddr[cpa->curpage]; 709 vaddr = cpa->vaddr[cpa->curpage];
702 else 710 else
703 vaddr = *cpa->vaddr; 711 vaddr = *cpa->vaddr;
@@ -706,42 +714,55 @@ static int cpa_process_alias(struct cpa_data *cpa)
706 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { 714 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
707 715
708 alias_cpa = *cpa; 716 alias_cpa = *cpa;
709 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 717 alias_cpa.vaddr = &laddr;
710 alias_cpa.vaddr = &temp_cpa_vaddr;
711 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 718 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
712 719
713
714 ret = __change_page_attr_set_clr(&alias_cpa, 0); 720 ret = __change_page_attr_set_clr(&alias_cpa, 0);
721 if (ret)
722 return ret;
715 } 723 }
716 724
717#ifdef CONFIG_X86_64 725#ifdef CONFIG_X86_64
718 if (ret)
719 return ret;
720 /*
721 * No need to redo, when the primary call touched the high
722 * mapping already:
723 */
724 if (within(vaddr, (unsigned long) _text, _brk_end))
725 return 0;
726
727 /* 726 /*
728 * If the physical address is inside the kernel map, we need 727 * If the primary call didn't touch the high mapping already
728 * and the physical address is inside the kernel map, we need
729 * to touch the high mapped kernel as well: 729 * to touch the high mapped kernel as well:
730 */ 730 */
731 if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) 731 if (!within(vaddr, (unsigned long)_text, _brk_end) &&
732 return 0; 732 within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
733 unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
734 __START_KERNEL_map - phys_base;
735 alias_cpa = *cpa;
736 alias_cpa.vaddr = &temp_cpa_vaddr;
737 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
733 738
734 alias_cpa = *cpa; 739 /*
735 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 740 * The high mapping range is imprecise, so ignore the
736 alias_cpa.vaddr = &temp_cpa_vaddr; 741 * return value.
737 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 742 */
743 __change_page_attr_set_clr(&alias_cpa, 0);
744 }
745#endif
738 746
739 /* 747 /*
740 * The high mapping range is imprecise, so ignore the return value. 748 * If the PMD page was partially used for per-cpu remapping,
749 * the recycled area needs to be split and modified. Because
750 * the area is always proper subset of a PMD page
751 * cpa->numpages is guaranteed to be 1 for these areas, so
752 * there's no need to loop over and check for further remaps.
741 */ 753 */
742 __change_page_attr_set_clr(&alias_cpa, 0); 754 remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
743#endif 755 if (remapped) {
744 return ret; 756 WARN_ON(cpa->numpages > 1);
757 alias_cpa = *cpa;
758 alias_cpa.vaddr = &remapped;
759 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
760 ret = __change_page_attr_set_clr(&alias_cpa, 0);
761 if (ret)
762 return ret;
763 }
764
765 return 0;
745} 766}
746 767
747static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) 768static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
@@ -982,12 +1003,15 @@ EXPORT_SYMBOL(set_memory_array_uc);
982int _set_memory_wc(unsigned long addr, int numpages) 1003int _set_memory_wc(unsigned long addr, int numpages)
983{ 1004{
984 int ret; 1005 int ret;
1006 unsigned long addr_copy = addr;
1007
985 ret = change_page_attr_set(&addr, numpages, 1008 ret = change_page_attr_set(&addr, numpages,
986 __pgprot(_PAGE_CACHE_UC_MINUS), 0); 1009 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
987
988 if (!ret) { 1010 if (!ret) {
989 ret = change_page_attr_set(&addr, numpages, 1011 ret = change_page_attr_set_clr(&addr_copy, numpages,
990 __pgprot(_PAGE_CACHE_WC), 0); 1012 __pgprot(_PAGE_CACHE_WC),
1013 __pgprot(_PAGE_CACHE_MASK),
1014 0, 0, NULL);
991 } 1015 }
992 return ret; 1016 return ret;
993} 1017}
@@ -1104,7 +1128,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
1104 int free_idx; 1128 int free_idx;
1105 1129
1106 for (i = 0; i < addrinarray; i++) { 1130 for (i = 0; i < addrinarray; i++) {
1107 start = (unsigned long)page_address(pages[i]); 1131 if (PageHighMem(pages[i]))
1132 continue;
1133 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1108 end = start + PAGE_SIZE; 1134 end = start + PAGE_SIZE;
1109 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) 1135 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
1110 goto err_out; 1136 goto err_out;
@@ -1117,7 +1143,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
1117err_out: 1143err_out:
1118 free_idx = i; 1144 free_idx = i;
1119 for (i = 0; i < free_idx; i++) { 1145 for (i = 0; i < free_idx; i++) {
1120 start = (unsigned long)page_address(pages[i]); 1146 if (PageHighMem(pages[i]))
1147 continue;
1148 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1121 end = start + PAGE_SIZE; 1149 end = start + PAGE_SIZE;
1122 free_memtype(start, end); 1150 free_memtype(start, end);
1123 } 1151 }
@@ -1146,7 +1174,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
1146 return retval; 1174 return retval;
1147 1175
1148 for (i = 0; i < addrinarray; i++) { 1176 for (i = 0; i < addrinarray; i++) {
1149 start = (unsigned long)page_address(pages[i]); 1177 if (PageHighMem(pages[i]))
1178 continue;
1179 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1150 end = start + PAGE_SIZE; 1180 end = start + PAGE_SIZE;
1151 free_memtype(start, end); 1181 free_memtype(start, end);
1152 } 1182 }
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8e43bdd45456..ed34f5e35999 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -25,7 +25,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
25 return pte; 25 return pte;
26} 26}
27 27
28void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 28void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
29{ 29{
30 pgtable_page_dtor(pte); 30 pgtable_page_dtor(pte);
31 paravirt_release_pte(page_to_pfn(pte)); 31 paravirt_release_pte(page_to_pfn(pte));
@@ -33,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
33} 33}
34 34
35#if PAGETABLE_LEVELS > 2 35#if PAGETABLE_LEVELS > 2
36void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) 36void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
37{ 37{
38 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); 38 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
39 tlb_remove_page(tlb, virt_to_page(pmd)); 39 tlb_remove_page(tlb, virt_to_page(pmd));
40} 40}
41 41
42#if PAGETABLE_LEVELS > 3 42#if PAGETABLE_LEVELS > 3
43void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 43void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
44{ 44{
45 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 45 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
46 tlb_remove_page(tlb, virt_to_page(pud)); 46 tlb_remove_page(tlb, virt_to_page(pud));
@@ -329,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve)
329 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 329 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
330 (int)-reserve); 330 (int)-reserve);
331 __FIXADDR_TOP = -reserve - PAGE_SIZE; 331 __FIXADDR_TOP = -reserve - PAGE_SIZE;
332 __VMALLOC_RESERVE += reserve;
333#endif 332#endif
334} 333}
335 334
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 2dfcbf9df2ae..dbb5381f7b3b 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -79,8 +79,10 @@ static __init void bad_srat(void)
79 acpi_numa = -1; 79 acpi_numa = -1;
80 for (i = 0; i < MAX_LOCAL_APIC; i++) 80 for (i = 0; i < MAX_LOCAL_APIC; i++)
81 apicid_to_node[i] = NUMA_NO_NODE; 81 apicid_to_node[i] = NUMA_NO_NODE;
82 for (i = 0; i < MAX_NUMNODES; i++) 82 for (i = 0; i < MAX_NUMNODES; i++) {
83 nodes_add[i].start = nodes[i].end = 0; 83 nodes[i].start = nodes[i].end = 0;
84 nodes_add[i].start = nodes_add[i].end = 0;
85 }
84 remove_all_active_ranges(); 86 remove_all_active_ranges();
85} 87}
86 88
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index b07dd8d0b321..89b9a5cd63da 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -390,7 +390,7 @@ static int __init p4_init(char **cpu_type)
390static int force_arch_perfmon; 390static int force_arch_perfmon;
391static int force_cpu_type(const char *str, struct kernel_param *kp) 391static int force_cpu_type(const char *str, struct kernel_param *kp)
392{ 392{
393 if (!strcmp(str, "archperfmon")) { 393 if (!strcmp(str, "arch_perfmon")) {
394 force_arch_perfmon = 1; 394 force_arch_perfmon = 1;
395 printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); 395 printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
396 } 396 }
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index b26626dc517c..1014eb4bfc37 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -68,6 +68,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
68 unsigned long flags; 68 unsigned long flags;
69 struct resource *root; 69 struct resource *root;
70 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; 70 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES;
71 u64 start, end;
72
73 if (bus_has_transparent_bridge(info->bus))
74 max_root_bus_resources -= 3;
71 75
72 status = resource_to_addr(acpi_res, &addr); 76 status = resource_to_addr(acpi_res, &addr);
73 if (!ACPI_SUCCESS(status)) 77 if (!ACPI_SUCCESS(status))
@@ -84,25 +88,24 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
84 } else 88 } else
85 return AE_OK; 89 return AE_OK;
86 90
87 res = &info->res[info->res_num]; 91 start = addr.minimum + addr.translation_offset;
88 res->name = info->name; 92 end = start + addr.address_length - 1;
89 res->flags = flags;
90 res->start = addr.minimum + addr.translation_offset;
91 res->end = res->start + addr.address_length - 1;
92 res->child = NULL;
93
94 if (bus_has_transparent_bridge(info->bus))
95 max_root_bus_resources -= 3;
96 if (info->res_num >= max_root_bus_resources) { 93 if (info->res_num >= max_root_bus_resources) {
97 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " 94 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx "
98 "from %s for %s due to _CRS returning more than " 95 "from %s for %s due to _CRS returning more than "
99 "%d resource descriptors\n", (unsigned long) res->start, 96 "%d resource descriptors\n", (unsigned long) start,
100 (unsigned long) res->end, root->name, info->name, 97 (unsigned long) end, root->name, info->name,
101 max_root_bus_resources); 98 max_root_bus_resources);
102 info->res_num++;
103 return AE_OK; 99 return AE_OK;
104 } 100 }
105 101
102 res = &info->res[info->res_num];
103 res->name = info->name;
104 res->flags = flags;
105 res->start = start;
106 res->end = end;
107 res->child = NULL;
108
106 if (insert_resource(root, res)) { 109 if (insert_resource(root, res)) {
107 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " 110 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx "
108 "from %s for %s\n", (unsigned long) res->start, 111 "from %s for %s\n", (unsigned long) res->start,
@@ -115,23 +118,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
115} 118}
116 119
117static void 120static void
118adjust_transparent_bridge_resources(struct pci_bus *bus)
119{
120 struct pci_dev *dev;
121
122 list_for_each_entry(dev, &bus->devices, bus_list) {
123 int i;
124 u16 class = dev->class >> 8;
125
126 if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) {
127 for(i = 3; i < PCI_BUS_NUM_RESOURCES; i++)
128 dev->subordinate->resource[i] =
129 dev->bus->resource[i - 3];
130 }
131 }
132}
133
134static void
135get_current_resources(struct acpi_device *device, int busnum, 121get_current_resources(struct acpi_device *device, int busnum,
136 int domain, struct pci_bus *bus) 122 int domain, struct pci_bus *bus)
137{ 123{
@@ -158,8 +144,6 @@ get_current_resources(struct acpi_device *device, int busnum,
158 info.res_num = 0; 144 info.res_num = 0;
159 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, 145 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
160 &info); 146 &info);
161 if (info.res_num)
162 adjust_transparent_bridge_resources(bus);
163 147
164 return; 148 return;
165 149
@@ -222,8 +206,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
222 */ 206 */
223 memcpy(bus->sysdata, sd, sizeof(*sd)); 207 memcpy(bus->sysdata, sd, sizeof(*sd));
224 kfree(sd); 208 kfree(sd);
225 } else 209 } else {
226 bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); 210 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
211 if (bus) {
212 if (pci_probe & PCI_USE__CRS)
213 get_current_resources(device, busnum, domain,
214 bus);
215 bus->subordinate = pci_scan_child_bus(bus);
216 }
217 }
227 218
228 if (!bus) 219 if (!bus)
229 kfree(sd); 220 kfree(sd);
@@ -238,8 +229,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
238#endif 229#endif
239 } 230 }
240 231
241 if (bus && (pci_probe & PCI_USE__CRS))
242 get_current_resources(device, busnum, domain, bus);
243 return bus; 232 return bus;
244} 233}
245 234
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index f893d6a6e803..3ffa10df20b9 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -100,8 +100,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
100 int j; 100 int j;
101 struct pci_root_info *info; 101 struct pci_root_info *info;
102 102
103 /* don't go for it if _CRS is used */ 103 /* don't go for it if _CRS is used already */
104 if (pci_probe & PCI_USE__CRS) 104 if (b->resource[0] != &ioport_resource ||
105 b->resource[1] != &iomem_resource)
105 return; 106 return;
106 107
107 /* if only one root bus, don't need to anything */ 108 /* if only one root bus, don't need to anything */
@@ -116,6 +117,9 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
116 if (i == pci_root_num) 117 if (i == pci_root_num)
117 return; 118 return;
118 119
120 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
121 b->number);
122
119 info = &pci_root_info[i]; 123 info = &pci_root_info[i];
120 for (j = 0; j < info->res_num; j++) { 124 for (j = 0; j < info->res_num; j++) {
121 struct resource *res; 125 struct resource *res;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 0fb56db16d18..52e62e57fedd 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -35,6 +35,7 @@
35#include <asm/pat.h> 35#include <asm/pat.h>
36#include <asm/e820.h> 36#include <asm/e820.h>
37#include <asm/pci_x86.h> 37#include <asm/pci_x86.h>
38#include <asm/io_apic.h>
38 39
39 40
40static int 41static int
@@ -227,6 +228,12 @@ void __init pcibios_resource_survey(void)
227 pcibios_allocate_resources(1); 228 pcibios_allocate_resources(1);
228 229
229 e820_reserve_resources_late(); 230 e820_reserve_resources_late();
231 /*
232 * Insert the IO APIC resources after PCI initialization has
233 * occured to handle IO APICS that are mapped in on a BAR in
234 * PCI space, but before trying to assign unassigned pci res.
235 */
236 ioapic_insert_resources();
230} 237}
231 238
232/** 239/**
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index de2abbd07544..a6a198c33623 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,7 +1,7 @@
1# __restore_processor_state() restores %gs after S3 resume and so should not 1# __restore_processor_state() restores %gs after S3 resume and so should not
2# itself be stack-protected 2# itself be stack-protected
3nostackp := $(call cc-option, -fno-stack-protector) 3nostackp := $(call cc-option, -fno-stack-protector)
4CFLAGS_cpu_$(BITS).o := $(nostackp) 4CFLAGS_cpu.o := $(nostackp)
5 5
6obj-$(CONFIG_PM_SLEEP) += cpu.o 6obj-$(CONFIG_PM_SLEEP) += cpu.o
7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o 7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index d277ef1eea51..b3d20b9cac63 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -244,7 +244,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
244 do_fpu_end(); 244 do_fpu_end();
245 mtrr_ap_init(); 245 mtrr_ap_init();
246 246
247#ifdef CONFIG_X86_32 247#ifdef CONFIG_X86_OLD_MCE
248 mcheck_init(&boot_cpu_data); 248 mcheck_init(&boot_cpu_data);
249#endif 249#endif
250} 250}