aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig33
-rw-r--r--arch/x86/Kconfig.debug20
-rw-r--r--arch/x86/boot/a20.c3
-rw-r--r--arch/x86/boot/printf.c2
-rw-r--r--arch/x86/kernel/.gitignore1
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c16
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S38
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h5
-rw-r--r--arch/x86/kernel/acpi/sleep.c16
-rw-r--r--arch/x86/kernel/apic_64.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c15
-rw-r--r--arch/x86/kernel/efi_32.c8
-rw-r--r--arch/x86/kernel/entry_32.S1
-rw-r--r--arch/x86/kernel/geode_32.c5
-rw-r--r--arch/x86/kernel/head_32.S2
-rw-r--r--arch/x86/kernel/head_64.S2
-rw-r--r--arch/x86/kernel/i387.c48
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/io_apic_32.c12
-rw-r--r--arch/x86/kernel/kvmclock.c93
-rw-r--r--arch/x86/kernel/mfgpt_32.c2
-rw-r--r--arch/x86/kernel/nmi_32.c9
-rw-r--r--arch/x86/kernel/pci-dma.c14
-rw-r--r--arch/x86/kernel/pci-gart_64.c31
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c6
-rw-r--r--arch/x86/kernel/pvclock.c141
-rw-r--r--arch/x86/kernel/rtc.c34
-rw-r--r--arch/x86/kernel/setup_32.c10
-rw-r--r--arch/x86/kernel/smpboot.c6
-rw-r--r--arch/x86/kernel/traps_32.c1
-rw-r--r--arch/x86/kernel/tsc_32.c23
-rw-r--r--arch/x86/kernel/tsc_64.c5
-rw-r--r--arch/x86/kvm/i8254.c25
-rw-r--r--arch/x86/kvm/irq.c6
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/lapic.c3
-rw-r--r--arch/x86/kvm/mmu.c24
-rw-r--r--arch/x86/kvm/paging_tmpl.h2
-rw-r--r--arch/x86/kvm/svm.c2
-rw-r--r--arch/x86/kvm/vmx.c22
-rw-r--r--arch/x86/kvm/x86.c93
-rw-r--r--arch/x86/kvm/x86_emulate.c10
-rw-r--r--arch/x86/lguest/boot.c5
-rw-r--r--arch/x86/lib/copy_user_64.S25
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S25
-rw-r--r--arch/x86/lib/delay_32.c31
-rw-r--r--arch/x86/lib/delay_64.c30
-rw-r--r--arch/x86/math-emu/fpu_entry.c13
-rw-r--r--arch/x86/mm/fault.c5
-rw-r--r--arch/x86/mm/init_64.c13
-rw-r--r--arch/x86/mm/ioremap.c30
-rw-r--r--arch/x86/mm/pat.c51
-rw-r--r--arch/x86/mm/srat_64.c27
-rw-r--r--arch/x86/pci/common.c8
-rw-r--r--arch/x86/pci/init.c3
-rw-r--r--arch/x86/pci/irq.c7
-rw-r--r--arch/x86/pci/olpc.c5
-rw-r--r--arch/x86/pci/pci.h2
-rw-r--r--arch/x86/vdso/vclock_gettime.c6
-rw-r--r--arch/x86/xen/Kconfig3
-rw-r--r--arch/x86/xen/enlighten.c56
-rw-r--r--arch/x86/xen/mmu.c77
-rw-r--r--arch/x86/xen/mmu.h24
-rw-r--r--arch/x86/xen/time.c145
-rw-r--r--arch/x86/xen/xen-head.S6
68 files changed, 834 insertions, 566 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe361ae7ef2f..bf07b6f50fa1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,17 +26,10 @@ config X86
26 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 26 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
27 select HAVE_ARCH_KGDB if !X86_VOYAGER 27 select HAVE_ARCH_KGDB if !X86_VOYAGER
28 28
29config DEFCONFIG_LIST 29config ARCH_DEFCONFIG
30 string 30 string
31 depends on X86_32 31 default "arch/x86/configs/i386_defconfig" if X86_32
32 option defconfig_list 32 default "arch/x86/configs/x86_64_defconfig" if X86_64
33 default "arch/x86/configs/i386_defconfig"
34
35config DEFCONFIG_LIST
36 string
37 depends on X86_64
38 option defconfig_list
39 default "arch/x86/configs/x86_64_defconfig"
40 33
41 34
42config GENERIC_LOCKBREAK 35config GENERIC_LOCKBREAK
@@ -390,6 +383,7 @@ config VMI
390config KVM_CLOCK 383config KVM_CLOCK
391 bool "KVM paravirtualized clock" 384 bool "KVM paravirtualized clock"
392 select PARAVIRT 385 select PARAVIRT
386 select PARAVIRT_CLOCK
393 depends on !(X86_VISWS || X86_VOYAGER) 387 depends on !(X86_VISWS || X86_VOYAGER)
394 help 388 help
395 Turning on this option will allow you to run a paravirtualized clock 389 Turning on this option will allow you to run a paravirtualized clock
@@ -417,6 +411,10 @@ config PARAVIRT
417 over full virtualization. However, when run without a hypervisor 411 over full virtualization. However, when run without a hypervisor
418 the kernel is theoretically slower and slightly larger. 412 the kernel is theoretically slower and slightly larger.
419 413
414config PARAVIRT_CLOCK
415 bool
416 default n
417
420endif 418endif
421 419
422config MEMTEST_BOOTPARAM 420config MEMTEST_BOOTPARAM
@@ -968,8 +966,8 @@ config NUMA_EMU
968 number of nodes. This is only useful for debugging. 966 number of nodes. This is only useful for debugging.
969 967
970config NODES_SHIFT 968config NODES_SHIFT
971 int "Max num nodes shift(1-15)" 969 int "Max num nodes shift(1-9)"
972 range 1 15 if X86_64 970 range 1 9 if X86_64
973 default "6" if X86_64 971 default "6" if X86_64
974 default "4" if X86_NUMAQ 972 default "4" if X86_NUMAQ
975 default "3" 973 default "3"
@@ -1515,13 +1513,13 @@ config PCI_GOMMCONFIG
1515config PCI_GODIRECT 1513config PCI_GODIRECT
1516 bool "Direct" 1514 bool "Direct"
1517 1515
1518config PCI_GOANY
1519 bool "Any"
1520
1521config PCI_GOOLPC 1516config PCI_GOOLPC
1522 bool "OLPC" 1517 bool "OLPC"
1523 depends on OLPC 1518 depends on OLPC
1524 1519
1520config PCI_GOANY
1521 bool "Any"
1522
1525endchoice 1523endchoice
1526 1524
1527config PCI_BIOS 1525config PCI_BIOS
@@ -1538,9 +1536,8 @@ config PCI_MMCONFIG
1538 depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) 1536 depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
1539 1537
1540config PCI_OLPC 1538config PCI_OLPC
1541 bool 1539 def_bool y
1542 depends on PCI && PCI_GOOLPC 1540 depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
1543 default y
1544 1541
1545config PCI_DOMAINS 1542config PCI_DOMAINS
1546 def_bool y 1543 def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index ac1e31ba4795..18363374d51a 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -6,15 +6,19 @@ config TRACE_IRQFLAGS_SUPPORT
6source "lib/Kconfig.debug" 6source "lib/Kconfig.debug"
7 7
8config NONPROMISC_DEVMEM 8config NONPROMISC_DEVMEM
9 bool "Disable promiscuous /dev/mem" 9 bool "Filter access to /dev/mem"
10 help 10 help
11 The /dev/mem file by default only allows userspace access to PCI 11 If this option is left off, you allow userspace access to all
12 space and the BIOS code and data regions. This is sufficient for 12 of memory, including kernel and userspace memory. Accidental
13 dosemu and X and all common users of /dev/mem. With this config 13 access to this is obviously disastrous, but specific access can
14 option, you allow userspace access to all of memory, including 14 be used by people debugging the kernel.
15 kernel and userspace memory. Accidental access to this is 15
16 obviously disasterous, but specific access can be used by people 16 If this option is switched on, the /dev/mem file only allows
17 debugging the kernel. 17 userspace access to PCI space and the BIOS code and data regions.
18 This is sufficient for dosemu and X and all common users of
19 /dev/mem.
20
21 If in doubt, say Y.
18 22
19config EARLY_PRINTK 23config EARLY_PRINTK
20 bool "Early printk" if EMBEDDED 24 bool "Early printk" if EMBEDDED
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 90943f83e84d..e01aafd03bde 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -115,8 +115,6 @@ static void enable_a20_fast(void)
115 115
116int enable_a20(void) 116int enable_a20(void)
117{ 117{
118 int loops = A20_ENABLE_LOOPS;
119
120#if defined(CONFIG_X86_ELAN) 118#if defined(CONFIG_X86_ELAN)
121 /* Elan croaks if we try to touch the KBC */ 119 /* Elan croaks if we try to touch the KBC */
122 enable_a20_fast(); 120 enable_a20_fast();
@@ -128,6 +126,7 @@ int enable_a20(void)
128 enable_a20_kbc(); 126 enable_a20_kbc();
129 return 0; 127 return 0;
130#else 128#else
129 int loops = A20_ENABLE_LOOPS;
131 while (loops--) { 130 while (loops--) {
132 /* First, check to see if A20 is already enabled 131 /* First, check to see if A20 is already enabled
133 (legacy free, etc.) */ 132 (legacy free, etc.) */
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index c1d00c0274c4..50e47cdbdddd 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -56,7 +56,7 @@ static char *number(char *str, long num, int base, int size, int precision,
56 if (type & LEFT) 56 if (type & LEFT)
57 type &= ~ZEROPAD; 57 type &= ~ZEROPAD;
58 if (base < 2 || base > 36) 58 if (base < 2 || base > 36)
59 return 0; 59 return NULL;
60 c = (type & ZEROPAD) ? '0' : ' '; 60 c = (type & ZEROPAD) ? '0' : ' ';
61 sign = 0; 61 sign = 0;
62 if (type & SIGN) { 62 if (type & SIGN) {
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
index 4ea38a39aed4..08f4fd731469 100644
--- a/arch/x86/kernel/.gitignore
+++ b/arch/x86/kernel/.gitignore
@@ -1,2 +1,3 @@
1vsyscall.lds 1vsyscall.lds
2vsyscall_32.lds 2vsyscall_32.lds
3vmlinux.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b4720..77807d4769c9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
82obj-$(CONFIG_KVM_GUEST) += kvm.o 82obj-$(CONFIG_KVM_GUEST) += kvm.o
83obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 83obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
84obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 84obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
85obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
85 86
86obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 87obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
87 88
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c49ebcc6c41e..33c5216fd3e1 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -242,12 +242,19 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
242 242
243static void __cpuinit acpi_register_lapic(int id, u8 enabled) 243static void __cpuinit acpi_register_lapic(int id, u8 enabled)
244{ 244{
245 unsigned int ver = 0;
246
245 if (!enabled) { 247 if (!enabled) {
246 ++disabled_cpus; 248 ++disabled_cpus;
247 return; 249 return;
248 } 250 }
249 251
250 generic_processor_info(id, 0); 252#ifdef CONFIG_X86_32
253 if (boot_cpu_physical_apicid != -1U)
254 ver = apic_version[boot_cpu_physical_apicid];
255#endif
256
257 generic_processor_info(id, ver);
251} 258}
252 259
253static int __init 260static int __init
@@ -767,8 +774,13 @@ static void __init acpi_register_lapic_address(unsigned long address)
767 mp_lapic_addr = address; 774 mp_lapic_addr = address;
768 775
769 set_fixmap_nocache(FIX_APIC_BASE, address); 776 set_fixmap_nocache(FIX_APIC_BASE, address);
770 if (boot_cpu_physical_apicid == -1U) 777 if (boot_cpu_physical_apicid == -1U) {
771 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 778 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
779#ifdef CONFIG_X86_32
780 apic_version[boot_cpu_physical_apicid] =
781 GET_APIC_VERSION(apic_read(APIC_LVR));
782#endif
783 }
772} 784}
773 785
774static int __init early_acpi_parse_madt_lapic_addr_ovr(void) 786static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index f9b77fb37e5b..3355973b12ac 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -5,6 +5,7 @@
5#include <asm/msr-index.h> 5#include <asm/msr-index.h>
6#include <asm/page.h> 6#include <asm/page.h>
7#include <asm/pgtable.h> 7#include <asm/pgtable.h>
8#include <asm/processor-flags.h>
8 9
9 .code16 10 .code16
10 .section ".header", "a" 11 .section ".header", "a"
@@ -24,6 +25,11 @@ pmode_gdt: .quad 0
24realmode_flags: .long 0 25realmode_flags: .long 0
25real_magic: .long 0 26real_magic: .long 0
26trampoline_segment: .word 0 27trampoline_segment: .word 0
28_pad1: .byte 0
29wakeup_jmp: .byte 0xea /* ljmpw */
30wakeup_jmp_off: .word 3f
31wakeup_jmp_seg: .word 0
32wakeup_gdt: .quad 0, 0, 0
27signature: .long 0x51ee1111 33signature: .long 0x51ee1111
28 34
29 .text 35 .text
@@ -34,11 +40,34 @@ _start:
34 cli 40 cli
35 cld 41 cld
36 42
43 /* Apparently some dimwit BIOS programmers don't know how to
44 program a PM to RM transition, and we might end up here with
45 junk in the data segment descriptor registers. The only way
46 to repair that is to go into PM and fix it ourselves... */
47 movw $16, %cx
48 lgdtl %cs:wakeup_gdt
49 movl %cr0, %eax
50 orb $X86_CR0_PE, %al
51 movl %eax, %cr0
52 jmp 1f
531: ljmpw $8, $2f
542:
55 movw %cx, %ds
56 movw %cx, %es
57 movw %cx, %ss
58 movw %cx, %fs
59 movw %cx, %gs
60
61 andb $~X86_CR0_PE, %al
62 movl %eax, %cr0
63 jmp wakeup_jmp
643:
37 /* Set up segments */ 65 /* Set up segments */
38 movw %cs, %ax 66 movw %cs, %ax
39 movw %ax, %ds 67 movw %ax, %ds
40 movw %ax, %es 68 movw %ax, %es
41 movw %ax, %ss 69 movw %ax, %ss
70 lidtl wakeup_idt
42 71
43 movl $wakeup_stack_end, %esp 72 movl $wakeup_stack_end, %esp
44 73
@@ -98,7 +127,14 @@ bogus_real_magic:
98 jmp 1b 127 jmp 1b
99 128
100 .data 129 .data
101 .balign 4 130 .balign 8
131
132 /* This is the standard real-mode IDT */
133wakeup_idt:
134 .word 0xffff /* limit */
135 .long 0 /* address */
136 .word 0
137
102 .globl HEAP, heap_end 138 .globl HEAP, heap_end
103HEAP: 139HEAP:
104 .long wakeup_heap 140 .long wakeup_heap
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index ef8166fe8020..69d38d0b2b64 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -24,6 +24,11 @@ struct wakeup_header {
24 u32 realmode_flags; 24 u32 realmode_flags;
25 u32 real_magic; 25 u32 real_magic;
26 u16 trampoline_segment; /* segment with trampoline code, 64-bit only */ 26 u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
27 u8 _pad1;
28 u8 wakeup_jmp;
29 u16 wakeup_jmp_off;
30 u16 wakeup_jmp_seg;
31 u64 wakeup_gdt[3];
27 u32 signature; /* To check we have correct structure */ 32 u32 signature; /* To check we have correct structure */
28} __attribute__((__packed__)); 33} __attribute__((__packed__));
29 34
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index afc25ee9964b..36af01f029ed 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -50,6 +50,20 @@ int acpi_save_state_mem(void)
50 50
51 header->video_mode = saved_video_mode; 51 header->video_mode = saved_video_mode;
52 52
53 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
54 /* GDT[0]: GDT self-pointer */
55 header->wakeup_gdt[0] =
56 (u64)(sizeof(header->wakeup_gdt) - 1) +
57 ((u64)(acpi_wakeup_address +
58 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
59 << 16);
60 /* GDT[1]: real-mode-like code segment */
61 header->wakeup_gdt[1] = (0x009bULL << 40) +
62 ((u64)acpi_wakeup_address << 16) + 0xffff;
63 /* GDT[2]: real-mode-like data segment */
64 header->wakeup_gdt[2] = (0x0093ULL << 40) +
65 ((u64)acpi_wakeup_address << 16) + 0xffff;
66
53#ifndef CONFIG_64BIT 67#ifndef CONFIG_64BIT
54 store_gdt((struct desc_ptr *)&header->pmode_gdt); 68 store_gdt((struct desc_ptr *)&header->pmode_gdt);
55 69
@@ -111,7 +125,7 @@ void __init acpi_reserve_bootmem(void)
111 return; 125 return;
112 } 126 }
113 127
114 acpi_wakeup_address = acpi_realmode; 128 acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
115} 129}
116 130
117 131
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 5910020c3f24..0633cfd0dc29 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -534,7 +534,7 @@ int setup_profiling_timer(unsigned int multiplier)
534 */ 534 */
535void clear_local_APIC(void) 535void clear_local_APIC(void)
536{ 536{
537 int maxlvt = lapic_get_maxlvt(); 537 int maxlvt;
538 u32 v; 538 u32 v;
539 539
540 /* APIC hasn't been mapped yet */ 540 /* APIC hasn't been mapped yet */
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index af4a867a097c..777a7ff075de 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -245,7 +245,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
245 if ((ecx > 95) || (ecx == 0) || (eax < ebx)) 245 if ((ecx > 95) || (ecx == 0) || (eax < ebx))
246 return -EIO; 246 return -EIO;
247 247
248 edx = (eax - ebx) / (100 - ecx); 248 edx = ((eax - ebx) * 100) / (100 - ecx);
249 *low_freq = edx * 1000; /* back to kHz */ 249 *low_freq = edx * 1000; /* back to kHz */
250 250
251 dprintk("low frequency is %u kHz\n", *low_freq); 251 dprintk("low frequency is %u kHz\n", *low_freq);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 46d4034d9f37..206791eb46e3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1127,12 +1127,23 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1127 * an UP version, and is deprecated by AMD. 1127 * an UP version, and is deprecated by AMD.
1128 */ 1128 */
1129 if (num_online_cpus() != 1) { 1129 if (num_online_cpus() != 1) {
1130 printk(KERN_ERR PFX "MP systems not supported by PSB BIOS structure\n"); 1130#ifndef CONFIG_ACPI_PROCESSOR
1131 printk(KERN_ERR PFX "ACPI Processor support is required "
1132 "for SMP systems but is absent. Please load the "
1133 "ACPI Processor module before starting this "
1134 "driver.\n");
1135#else
1136 printk(KERN_ERR PFX "Your BIOS does not provide ACPI "
1137 "_PSS objects in a way that Linux understands. "
1138 "Please report this to the Linux ACPI maintainers"
1139 " and complain to your BIOS vendor.\n");
1140#endif
1131 kfree(data); 1141 kfree(data);
1132 return -ENODEV; 1142 return -ENODEV;
1133 } 1143 }
1134 if (pol->cpu != 0) { 1144 if (pol->cpu != 0) {
1135 printk(KERN_ERR PFX "No _PSS objects for CPU other than CPU0\n"); 1145 printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than "
1146 "CPU0. Complain to your BIOS vendor.\n");
1136 kfree(data); 1147 kfree(data);
1137 return -ENODEV; 1148 return -ENODEV;
1138 } 1149 }
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index 5d23d85624d4..4b63c8e1f13b 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -49,13 +49,13 @@ void efi_call_phys_prelog(void)
49 local_irq_save(efi_rt_eflags); 49 local_irq_save(efi_rt_eflags);
50 50
51 /* 51 /*
52 * If I don't have PSE, I should just duplicate two entries in page 52 * If I don't have PAE, I should just duplicate two entries in page
53 * directory. If I have PSE, I just need to duplicate one entry in 53 * directory. If I have PAE, I just need to duplicate one entry in
54 * page directory. 54 * page directory.
55 */ 55 */
56 cr4 = read_cr4(); 56 cr4 = read_cr4();
57 57
58 if (cr4 & X86_CR4_PSE) { 58 if (cr4 & X86_CR4_PAE) {
59 efi_bak_pg_dir_pointer[0].pgd = 59 efi_bak_pg_dir_pointer[0].pgd =
60 swapper_pg_dir[pgd_index(0)].pgd; 60 swapper_pg_dir[pgd_index(0)].pgd;
61 swapper_pg_dir[0].pgd = 61 swapper_pg_dir[0].pgd =
@@ -93,7 +93,7 @@ void efi_call_phys_epilog(void)
93 93
94 cr4 = read_cr4(); 94 cr4 = read_cr4();
95 95
96 if (cr4 & X86_CR4_PSE) { 96 if (cr4 & X86_CR4_PAE) {
97 swapper_pg_dir[pgd_index(0)].pgd = 97 swapper_pg_dir[pgd_index(0)].pgd =
98 efi_bak_pg_dir_pointer[0].pgd; 98 efi_bak_pg_dir_pointer[0].pgd;
99 } else { 99 } else {
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2a609dc3271c..c778e4fa55a2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -248,6 +248,7 @@ ENTRY(resume_userspace)
248 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 248 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
249 # setting need_resched or sigpending 249 # setting need_resched or sigpending
250 # between sampling and the iret 250 # between sampling and the iret
251 TRACE_IRQS_OFF
251 movl TI_flags(%ebp), %ecx 252 movl TI_flags(%ebp), %ecx
252 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on 253 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
253 # int/exception return? 254 # int/exception return?
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
index e8edd63ab000..9b08e852fd1a 100644
--- a/arch/x86/kernel/geode_32.c
+++ b/arch/x86/kernel/geode_32.c
@@ -166,6 +166,8 @@ int geode_has_vsa2(void)
166 static int has_vsa2 = -1; 166 static int has_vsa2 = -1;
167 167
168 if (has_vsa2 == -1) { 168 if (has_vsa2 == -1) {
169 u16 val;
170
169 /* 171 /*
170 * The VSA has virtual registers that we can query for a 172 * The VSA has virtual registers that we can query for a
171 * signature. 173 * signature.
@@ -173,7 +175,8 @@ int geode_has_vsa2(void)
173 outw(VSA_VR_UNLOCK, VSA_VRC_INDEX); 175 outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
174 outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX); 176 outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
175 177
176 has_vsa2 = (inw(VSA_VRC_DATA) == VSA_SIG); 178 val = inw(VSA_VRC_DATA);
179 has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
177 } 180 }
178 181
179 return has_vsa2; 182 return has_vsa2;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index b2cc73768a9d..f7357cc0162c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -189,7 +189,7 @@ default_entry:
189 * this stage. 189 * this stage.
190 */ 190 */
191 191
192#define KPMDS ((0x100000000-__PAGE_OFFSET) >> 30) /* Number of kernel PMDs */ 192#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
193 193
194 xorl %ebx,%ebx /* %ebx is kept at zero */ 194 xorl %ebx,%ebx /* %ebx is kept at zero */
195 195
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 10a1955bb1d1..b817974ef942 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -128,7 +128,7 @@ ident_complete:
128 /* Fixup phys_base */ 128 /* Fixup phys_base */
129 addq %rbp, phys_base(%rip) 129 addq %rbp, phys_base(%rip)
130 130
131#ifdef CONFIG_SMP 131#ifdef CONFIG_X86_TRAMPOLINE
132 addq %rbp, trampoline_level4_pgt + 0(%rip) 132 addq %rbp, trampoline_level4_pgt + 0(%rip)
133 addq %rbp, trampoline_level4_pgt + (511*8)(%rip) 133 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
134#endif 134#endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e03cc952f233..95e80e5033c3 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -56,6 +56,11 @@ void __cpuinit mxcsr_feature_mask_init(void)
56 56
57void __init init_thread_xstate(void) 57void __init init_thread_xstate(void)
58{ 58{
59 if (!HAVE_HWFP) {
60 xstate_size = sizeof(struct i387_soft_struct);
61 return;
62 }
63
59 if (cpu_has_fxsr) 64 if (cpu_has_fxsr)
60 xstate_size = sizeof(struct i387_fxsave_struct); 65 xstate_size = sizeof(struct i387_fxsave_struct);
61#ifdef CONFIG_X86_32 66#ifdef CONFIG_X86_32
@@ -94,7 +99,7 @@ void __cpuinit fpu_init(void)
94int init_fpu(struct task_struct *tsk) 99int init_fpu(struct task_struct *tsk)
95{ 100{
96 if (tsk_used_math(tsk)) { 101 if (tsk_used_math(tsk)) {
97 if (tsk == current) 102 if (HAVE_HWFP && tsk == current)
98 unlazy_fpu(tsk); 103 unlazy_fpu(tsk);
99 return 0; 104 return 0;
100 } 105 }
@@ -109,6 +114,15 @@ int init_fpu(struct task_struct *tsk)
109 return -ENOMEM; 114 return -ENOMEM;
110 } 115 }
111 116
117#ifdef CONFIG_X86_32
118 if (!HAVE_HWFP) {
119 memset(tsk->thread.xstate, 0, xstate_size);
120 finit();
121 set_stopped_child_used_math(tsk);
122 return 0;
123 }
124#endif
125
112 if (cpu_has_fxsr) { 126 if (cpu_has_fxsr) {
113 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; 127 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
114 128
@@ -148,7 +162,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
148 int ret; 162 int ret;
149 163
150 if (!cpu_has_fxsr) 164 if (!cpu_has_fxsr)
151 return -ENODEV; 165 return -EIO;
152 166
153 ret = init_fpu(target); 167 ret = init_fpu(target);
154 if (ret) 168 if (ret)
@@ -165,7 +179,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
165 int ret; 179 int ret;
166 180
167 if (!cpu_has_fxsr) 181 if (!cpu_has_fxsr)
168 return -ENODEV; 182 return -EIO;
169 183
170 ret = init_fpu(target); 184 ret = init_fpu(target);
171 if (ret) 185 if (ret)
@@ -330,13 +344,13 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
330 struct user_i387_ia32_struct env; 344 struct user_i387_ia32_struct env;
331 int ret; 345 int ret;
332 346
333 if (!HAVE_HWFP)
334 return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
335
336 ret = init_fpu(target); 347 ret = init_fpu(target);
337 if (ret) 348 if (ret)
338 return ret; 349 return ret;
339 350
351 if (!HAVE_HWFP)
352 return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
353
340 if (!cpu_has_fxsr) { 354 if (!cpu_has_fxsr) {
341 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 355 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
342 &target->thread.xstate->fsave, 0, 356 &target->thread.xstate->fsave, 0,
@@ -360,15 +374,15 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
360 struct user_i387_ia32_struct env; 374 struct user_i387_ia32_struct env;
361 int ret; 375 int ret;
362 376
363 if (!HAVE_HWFP)
364 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
365
366 ret = init_fpu(target); 377 ret = init_fpu(target);
367 if (ret) 378 if (ret)
368 return ret; 379 return ret;
369 380
370 set_stopped_child_used_math(target); 381 set_stopped_child_used_math(target);
371 382
383 if (!HAVE_HWFP)
384 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
385
372 if (!cpu_has_fxsr) { 386 if (!cpu_has_fxsr) {
373 return user_regset_copyin(&pos, &count, &kbuf, &ubuf, 387 return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
374 &target->thread.xstate->fsave, 0, -1); 388 &target->thread.xstate->fsave, 0, -1);
@@ -474,18 +488,18 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
474int restore_i387_ia32(struct _fpstate_ia32 __user *buf) 488int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
475{ 489{
476 int err; 490 int err;
491 struct task_struct *tsk = current;
477 492
478 if (HAVE_HWFP) { 493 if (HAVE_HWFP)
479 struct task_struct *tsk = current;
480
481 clear_fpu(tsk); 494 clear_fpu(tsk);
482 495
483 if (!used_math()) { 496 if (!used_math()) {
484 err = init_fpu(tsk); 497 err = init_fpu(tsk);
485 if (err) 498 if (err)
486 return err; 499 return err;
487 } 500 }
488 501
502 if (HAVE_HWFP) {
489 if (cpu_has_fxsr) 503 if (cpu_has_fxsr)
490 err = restore_i387_fxsave(buf); 504 err = restore_i387_fxsave(buf);
491 else 505 else
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 3d01e47777db..a4f93b4120c1 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -11,7 +11,6 @@
11#include <asm/desc.h> 11#include <asm/desc.h>
12 12
13static struct fs_struct init_fs = INIT_FS; 13static struct fs_struct init_fs = INIT_FS;
14static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 14static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 15static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm); 16struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index a40d54fc1fdd..4dc8600d9d20 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2130,14 +2130,10 @@ static inline void __init check_timer(void)
2130{ 2130{
2131 int apic1, pin1, apic2, pin2; 2131 int apic1, pin1, apic2, pin2;
2132 int vector; 2132 int vector;
2133 unsigned int ver;
2134 unsigned long flags; 2133 unsigned long flags;
2135 2134
2136 local_irq_save(flags); 2135 local_irq_save(flags);
2137 2136
2138 ver = apic_read(APIC_LVR);
2139 ver = GET_APIC_VERSION(ver);
2140
2141 /* 2137 /*
2142 * get/set the timer IRQ vector: 2138 * get/set the timer IRQ vector:
2143 */ 2139 */
@@ -2150,15 +2146,11 @@ static inline void __init check_timer(void)
2150 * mode for the 8259A whenever interrupts are routed 2146 * mode for the 8259A whenever interrupts are routed
2151 * through I/O APICs. Also IRQ0 has to be enabled in 2147 * through I/O APICs. Also IRQ0 has to be enabled in
2152 * the 8259A which implies the virtual wire has to be 2148 * the 8259A which implies the virtual wire has to be
2153 * disabled in the local APIC. Finally timer interrupts 2149 * disabled in the local APIC.
2154 * need to be acknowledged manually in the 8259A for
2155 * timer_interrupt() and for the i82489DX when using
2156 * the NMI watchdog.
2157 */ 2150 */
2158 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2151 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2159 init_8259A(1); 2152 init_8259A(1);
2160 timer_ack = !cpu_has_tsc; 2153 timer_ack = 1;
2161 timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2162 if (timer_over_8254 > 0) 2154 if (timer_over_8254 > 0)
2163 enable_8259A_irq(0); 2155 enable_8259A_irq(0);
2164 2156
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 4bc1be5d5472..87edf1ceb1df 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/clocksource.h> 19#include <linux/clocksource.h>
20#include <linux/kvm_para.h> 20#include <linux/kvm_para.h>
21#include <asm/pvclock.h>
21#include <asm/arch_hooks.h> 22#include <asm/arch_hooks.h>
22#include <asm/msr.h> 23#include <asm/msr.h>
23#include <asm/apic.h> 24#include <asm/apic.h>
@@ -36,83 +37,47 @@ static int parse_no_kvmclock(char *arg)
36early_param("no-kvmclock", parse_no_kvmclock); 37early_param("no-kvmclock", parse_no_kvmclock);
37 38
38/* The hypervisor will put information about time periodically here */ 39/* The hypervisor will put information about time periodically here */
39static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); 40static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
40#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field 41static struct pvclock_wall_clock wall_clock;
41 42
42static inline u64 kvm_get_delta(u64 last_tsc)
43{
44 int cpu = smp_processor_id();
45 u64 delta = native_read_tsc() - last_tsc;
46 return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
47}
48
49static struct kvm_wall_clock wall_clock;
50static cycle_t kvm_clock_read(void);
51/* 43/*
52 * The wallclock is the time of day when we booted. Since then, some time may 44 * The wallclock is the time of day when we booted. Since then, some time may
53 * have elapsed since the hypervisor wrote the data. So we try to account for 45 * have elapsed since the hypervisor wrote the data. So we try to account for
54 * that with system time 46 * that with system time
55 */ 47 */
56unsigned long kvm_get_wallclock(void) 48static unsigned long kvm_get_wallclock(void)
57{ 49{
58 u32 wc_sec, wc_nsec; 50 struct pvclock_vcpu_time_info *vcpu_time;
59 u64 delta;
60 struct timespec ts; 51 struct timespec ts;
61 int version, nsec;
62 int low, high; 52 int low, high;
63 53
64 low = (int)__pa(&wall_clock); 54 low = (int)__pa(&wall_clock);
65 high = ((u64)__pa(&wall_clock) >> 32); 55 high = ((u64)__pa(&wall_clock) >> 32);
56 native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
66 57
67 delta = kvm_clock_read(); 58 vcpu_time = &get_cpu_var(hv_clock);
59 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
60 put_cpu_var(hv_clock);
68 61
69 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 62 return ts.tv_sec;
70 do {
71 version = wall_clock.wc_version;
72 rmb();
73 wc_sec = wall_clock.wc_sec;
74 wc_nsec = wall_clock.wc_nsec;
75 rmb();
76 } while ((wall_clock.wc_version != version) || (version & 1));
77
78 delta = kvm_clock_read() - delta;
79 delta += wc_nsec;
80 nsec = do_div(delta, NSEC_PER_SEC);
81 set_normalized_timespec(&ts, wc_sec + delta, nsec);
82 /*
83 * Of all mechanisms of time adjustment I've tested, this one
84 * was the champion!
85 */
86 return ts.tv_sec + 1;
87} 63}
88 64
89int kvm_set_wallclock(unsigned long now) 65static int kvm_set_wallclock(unsigned long now)
90{ 66{
91 return 0; 67 return -1;
92} 68}
93 69
94/*
95 * This is our read_clock function. The host puts an tsc timestamp each time
96 * it updates a new time. Without the tsc adjustment, we can have a situation
97 * in which a vcpu starts to run earlier (smaller system_time), but probes
98 * time later (compared to another vcpu), leading to backwards time
99 */
100static cycle_t kvm_clock_read(void) 70static cycle_t kvm_clock_read(void)
101{ 71{
102 u64 last_tsc, now; 72 struct pvclock_vcpu_time_info *src;
103 int cpu; 73 cycle_t ret;
104 74
105 preempt_disable(); 75 src = &get_cpu_var(hv_clock);
106 cpu = smp_processor_id(); 76 ret = pvclock_clocksource_read(src);
107 77 put_cpu_var(hv_clock);
108 last_tsc = get_clock(cpu, tsc_timestamp); 78 return ret;
109 now = get_clock(cpu, system_time);
110
111 now += kvm_get_delta(last_tsc);
112 preempt_enable();
113
114 return now;
115} 79}
80
116static struct clocksource kvm_clock = { 81static struct clocksource kvm_clock = {
117 .name = "kvm-clock", 82 .name = "kvm-clock",
118 .read = kvm_clock_read, 83 .read = kvm_clock_read,
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
123 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 88 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
124}; 89};
125 90
126static int kvm_register_clock(void) 91static int kvm_register_clock(char *txt)
127{ 92{
128 int cpu = smp_processor_id(); 93 int cpu = smp_processor_id();
129 int low, high; 94 int low, high;
130 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 95 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
131 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 96 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
132 97 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
98 cpu, high, low, txt);
133 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); 99 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
134} 100}
135 101
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
140 * Now that the first cpu already had this clocksource initialized, 106 * Now that the first cpu already had this clocksource initialized,
141 * we shouldn't fail. 107 * we shouldn't fail.
142 */ 108 */
143 WARN_ON(kvm_register_clock()); 109 WARN_ON(kvm_register_clock("secondary cpu clock"));
144 /* ok, done with our trickery, call native */ 110 /* ok, done with our trickery, call native */
145 setup_secondary_APIC_clock(); 111 setup_secondary_APIC_clock();
146} 112}
147#endif 113#endif
148 114
115#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void)
117{
118 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu();
120}
121#endif
122
149/* 123/*
150 * After the clock is registered, the host will keep writing to the 124 * After the clock is registered, the host will keep writing to the
151 * registered memory location. If the guest happens to shutdown, this memory 125 * registered memory location. If the guest happens to shutdown, this memory
@@ -174,7 +148,7 @@ void __init kvmclock_init(void)
174 return; 148 return;
175 149
176 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 150 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
177 if (kvm_register_clock()) 151 if (kvm_register_clock("boot clock"))
178 return; 152 return;
179 pv_time_ops.get_wallclock = kvm_get_wallclock; 153 pv_time_ops.get_wallclock = kvm_get_wallclock;
180 pv_time_ops.set_wallclock = kvm_set_wallclock; 154 pv_time_ops.set_wallclock = kvm_set_wallclock;
@@ -182,6 +156,9 @@ void __init kvmclock_init(void)
182#ifdef CONFIG_X86_LOCAL_APIC 156#ifdef CONFIG_X86_LOCAL_APIC
183 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 157 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
184#endif 158#endif
159#ifdef CONFIG_SMP
160 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
161#endif
185 machine_ops.shutdown = kvm_shutdown; 162 machine_ops.shutdown = kvm_shutdown;
186#ifdef CONFIG_KEXEC 163#ifdef CONFIG_KEXEC
187 machine_ops.crash_shutdown = kvm_crash_shutdown; 164 machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 3cad17fe026b..07c0f828f488 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -155,6 +155,7 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
155 wrmsr(msr, value, dummy); 155 wrmsr(msr, value, dummy);
156 return 0; 156 return 0;
157} 157}
158EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
158 159
159int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) 160int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable)
160{ 161{
@@ -222,6 +223,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain)
222 /* No timers available - too bad */ 223 /* No timers available - too bad */
223 return -1; 224 return -1;
224} 225}
226EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
225 227
226 228
227#ifdef CONFIG_GEODE_MFGPT_TIMER 229#ifdef CONFIG_GEODE_MFGPT_TIMER
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 11b14bbaa61e..84160f74eeb0 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -26,7 +26,6 @@
26 26
27#include <asm/smp.h> 27#include <asm/smp.h>
28#include <asm/nmi.h> 28#include <asm/nmi.h>
29#include <asm/timer.h>
30 29
31#include "mach_traps.h" 30#include "mach_traps.h"
32 31
@@ -82,7 +81,7 @@ int __init check_nmi_watchdog(void)
82 81
83 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 82 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
84 if (!prev_nmi_count) 83 if (!prev_nmi_count)
85 goto error; 84 return -1;
86 85
87 printk(KERN_INFO "Testing NMI watchdog ... "); 86 printk(KERN_INFO "Testing NMI watchdog ... ");
88 87
@@ -119,7 +118,7 @@ int __init check_nmi_watchdog(void)
119 if (!atomic_read(&nmi_active)) { 118 if (!atomic_read(&nmi_active)) {
120 kfree(prev_nmi_count); 119 kfree(prev_nmi_count);
121 atomic_set(&nmi_active, -1); 120 atomic_set(&nmi_active, -1);
122 goto error; 121 return -1;
123 } 122 }
124 printk("OK.\n"); 123 printk("OK.\n");
125 124
@@ -130,10 +129,6 @@ int __init check_nmi_watchdog(void)
130 129
131 kfree(prev_nmi_count); 130 kfree(prev_nmi_count);
132 return 0; 131 return 0;
133error:
134 timer_ack = !cpu_has_tsc;
135
136 return -1;
137} 132}
138 133
139static int __init setup_nmi_watchdog(char *str) 134static int __init setup_nmi_watchdog(char *str)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index c5ef1af8e79d..dc00a1331ace 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -378,6 +378,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
378 struct page *page; 378 struct page *page;
379 unsigned long dma_mask = 0; 379 unsigned long dma_mask = 0;
380 dma_addr_t bus; 380 dma_addr_t bus;
381 int noretry = 0;
381 382
382 /* ignore region specifiers */ 383 /* ignore region specifiers */
383 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 384 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
@@ -397,20 +398,25 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
397 if (dev->dma_mask == NULL) 398 if (dev->dma_mask == NULL)
398 return NULL; 399 return NULL;
399 400
400 /* Don't invoke OOM killer */ 401 /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
401 gfp |= __GFP_NORETRY; 402 if (gfp & __GFP_DMA)
403 noretry = 1;
402 404
403#ifdef CONFIG_X86_64 405#ifdef CONFIG_X86_64
404 /* Why <=? Even when the mask is smaller than 4GB it is often 406 /* Why <=? Even when the mask is smaller than 4GB it is often
405 larger than 16MB and in this case we have a chance of 407 larger than 16MB and in this case we have a chance of
406 finding fitting memory in the next higher zone first. If 408 finding fitting memory in the next higher zone first. If
407 not retry with true GFP_DMA. -AK */ 409 not retry with true GFP_DMA. -AK */
408 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) 410 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
409 gfp |= GFP_DMA32; 411 gfp |= GFP_DMA32;
412 if (dma_mask < DMA_32BIT_MASK)
413 noretry = 1;
414 }
410#endif 415#endif
411 416
412 again: 417 again:
413 page = dma_alloc_pages(dev, gfp, get_order(size)); 418 page = dma_alloc_pages(dev,
419 noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
414 if (page == NULL) 420 if (page == NULL)
415 return NULL; 421 return NULL;
416 422
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c07455d1695f..aa8ec928caa8 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -26,6 +26,7 @@
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 28#include <linux/iommu-helper.h>
29#include <linux/sysdev.h>
29#include <asm/atomic.h> 30#include <asm/atomic.h>
30#include <asm/io.h> 31#include <asm/io.h>
31#include <asm/mtrr.h> 32#include <asm/mtrr.h>
@@ -548,6 +549,28 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
548 return aper_base; 549 return aper_base;
549} 550}
550 551
552static int gart_resume(struct sys_device *dev)
553{
554 return 0;
555}
556
557static int gart_suspend(struct sys_device *dev, pm_message_t state)
558{
559 return -EINVAL;
560}
561
562static struct sysdev_class gart_sysdev_class = {
563 .name = "gart",
564 .suspend = gart_suspend,
565 .resume = gart_resume,
566
567};
568
569static struct sys_device device_gart = {
570 .id = 0,
571 .cls = &gart_sysdev_class,
572};
573
551/* 574/*
552 * Private Northbridge GATT initialization in case we cannot use the 575 * Private Northbridge GATT initialization in case we cannot use the
553 * AGP driver for some reason. 576 * AGP driver for some reason.
@@ -558,7 +581,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
558 unsigned aper_base, new_aper_base; 581 unsigned aper_base, new_aper_base;
559 struct pci_dev *dev; 582 struct pci_dev *dev;
560 void *gatt; 583 void *gatt;
561 int i; 584 int i, error;
562 585
563 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 586 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
564 aper_size = aper_base = info->aper_size = 0; 587 aper_size = aper_base = info->aper_size = 0;
@@ -606,6 +629,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
606 629
607 pci_write_config_dword(dev, 0x90, ctl); 630 pci_write_config_dword(dev, 0x90, ctl);
608 } 631 }
632
633 error = sysdev_class_register(&gart_sysdev_class);
634 if (!error)
635 error = sysdev_register(&device_gart);
636 if (error)
637 panic("Could not register gart_sysdev -- would corrupt data on next suspend");
609 flush_gart(); 638 flush_gart();
610 639
611 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 640 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f8476dfbb60d..e2db9ac5c61c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -333,6 +333,7 @@ void flush_thread(void)
333 /* 333 /*
334 * Forget coprocessor state.. 334 * Forget coprocessor state..
335 */ 335 */
336 tsk->fpu_counter = 0;
336 clear_fpu(tsk); 337 clear_fpu(tsk);
337 clear_used_math(); 338 clear_used_math();
338} 339}
@@ -649,8 +650,11 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct
649 /* If the task has used fpu the last 5 timeslices, just do a full 650 /* If the task has used fpu the last 5 timeslices, just do a full
650 * restore of the math state immediately to avoid the trap; the 651 * restore of the math state immediately to avoid the trap; the
651 * chances of needing FPU soon are obviously high now 652 * chances of needing FPU soon are obviously high now
653 *
654 * tsk_used_math() checks prevent calling math_state_restore(),
655 * which can sleep in the case of !tsk_used_math()
652 */ 656 */
653 if (next_p->fpu_counter > 5) 657 if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
654 math_state_restore(); 658 math_state_restore();
655 659
656 /* 660 /*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e2319f39988b..c6eb5c91e5f6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -294,6 +294,7 @@ void flush_thread(void)
294 /* 294 /*
295 * Forget coprocessor state.. 295 * Forget coprocessor state..
296 */ 296 */
297 tsk->fpu_counter = 0;
297 clear_fpu(tsk); 298 clear_fpu(tsk);
298 clear_used_math(); 299 clear_used_math();
299} 300}
@@ -658,8 +659,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
658 /* If the task has used fpu the last 5 timeslices, just do a full 659 /* If the task has used fpu the last 5 timeslices, just do a full
659 * restore of the math state immediately to avoid the trap; the 660 * restore of the math state immediately to avoid the trap; the
660 * chances of needing FPU soon are obviously high now 661 * chances of needing FPU soon are obviously high now
662 *
663 * tsk_used_math() checks prevent calling math_state_restore(),
664 * which can sleep in the case of !tsk_used_math()
661 */ 665 */
662 if (next_p->fpu_counter>5) 666 if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
663 math_state_restore(); 667 math_state_restore();
664 return prev_p; 668 return prev_p;
665} 669}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 000000000000..05fbe9a0325a
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,141 @@
1/* paravirtual clock -- common code used by kvm/xen
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16*/
17
18#include <linux/kernel.h>
19#include <linux/percpu.h>
20#include <asm/pvclock.h>
21
22/*
23 * These are perodically updated
24 * xen: magic shared_info page
25 * kvm: gpa registered via msr
26 * and then copied here.
27 */
28struct pvclock_shadow_time {
29 u64 tsc_timestamp; /* TSC at last update of time vals. */
30 u64 system_timestamp; /* Time, in nanosecs, since boot. */
31 u32 tsc_to_nsec_mul;
32 int tsc_shift;
33 u32 version;
34};
35
36/*
37 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
38 * yielding a 64-bit result.
39 */
40static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
41{
42 u64 product;
43#ifdef __i386__
44 u32 tmp1, tmp2;
45#endif
46
47 if (shift < 0)
48 delta >>= -shift;
49 else
50 delta <<= shift;
51
52#ifdef __i386__
53 __asm__ (
54 "mul %5 ; "
55 "mov %4,%%eax ; "
56 "mov %%edx,%4 ; "
57 "mul %5 ; "
58 "xor %5,%5 ; "
59 "add %4,%%eax ; "
60 "adc %5,%%edx ; "
61 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
62 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
63#elif __x86_64__
64 __asm__ (
65 "mul %%rdx ; shrd $32,%%rdx,%%rax"
66 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
67#else
68#error implement me!
69#endif
70
71 return product;
72}
73
74static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
75{
76 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
77 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
78}
79
80/*
81 * Reads a consistent set of time-base values from hypervisor,
82 * into a shadow data area.
83 */
84static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
85 struct pvclock_vcpu_time_info *src)
86{
87 do {
88 dst->version = src->version;
89 rmb(); /* fetch version before data */
90 dst->tsc_timestamp = src->tsc_timestamp;
91 dst->system_timestamp = src->system_time;
92 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
93 dst->tsc_shift = src->tsc_shift;
94 rmb(); /* test version after fetching data */
95 } while ((src->version & 1) || (dst->version != src->version));
96
97 return dst->version;
98}
99
100cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
101{
102 struct pvclock_shadow_time shadow;
103 unsigned version;
104 cycle_t ret, offset;
105
106 do {
107 version = pvclock_get_time_values(&shadow, src);
108 barrier();
109 offset = pvclock_get_nsec_offset(&shadow);
110 ret = shadow.system_timestamp + offset;
111 barrier();
112 } while (version != src->version);
113
114 return ret;
115}
116
117void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
118 struct pvclock_vcpu_time_info *vcpu_time,
119 struct timespec *ts)
120{
121 u32 version;
122 u64 delta;
123 struct timespec now;
124
125 /* get wallclock at system boot */
126 do {
127 version = wall_clock->version;
128 rmb(); /* fetch version before time */
129 now.tv_sec = wall_clock->sec;
130 now.tv_nsec = wall_clock->nsec;
131 rmb(); /* fetch time before checking version */
132 } while ((wall_clock->version & 1) || (version != wall_clock->version));
133
134 delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
135 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
136
137 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
138 now.tv_sec = delta;
139
140 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
141}
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 9615eee9b775..05191bbc68b8 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -4,6 +4,8 @@
4#include <linux/acpi.h> 4#include <linux/acpi.h>
5#include <linux/bcd.h> 5#include <linux/bcd.h>
6#include <linux/mc146818rtc.h> 6#include <linux/mc146818rtc.h>
7#include <linux/platform_device.h>
8#include <linux/pnp.h>
7 9
8#include <asm/time.h> 10#include <asm/time.h>
9#include <asm/vsyscall.h> 11#include <asm/vsyscall.h>
@@ -197,3 +199,35 @@ unsigned long long native_read_tsc(void)
197} 199}
198EXPORT_SYMBOL(native_read_tsc); 200EXPORT_SYMBOL(native_read_tsc);
199 201
202
203static struct resource rtc_resources[] = {
204 [0] = {
205 .start = RTC_PORT(0),
206 .end = RTC_PORT(1),
207 .flags = IORESOURCE_IO,
208 },
209 [1] = {
210 .start = RTC_IRQ,
211 .end = RTC_IRQ,
212 .flags = IORESOURCE_IRQ,
213 }
214};
215
216static struct platform_device rtc_device = {
217 .name = "rtc_cmos",
218 .id = -1,
219 .resource = rtc_resources,
220 .num_resources = ARRAY_SIZE(rtc_resources),
221};
222
223static __init int add_rtc_cmos(void)
224{
225#ifdef CONFIG_PNP
226 if (!pnp_platform_devices)
227 platform_device_register(&rtc_device);
228#else
229 platform_device_register(&rtc_device);
230#endif /* CONFIG_PNP */
231 return 0;
232}
233device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 2c5f8b213e86..5a2f8e063887 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -532,10 +532,16 @@ static void __init reserve_crashkernel(void)
532 (unsigned long)(crash_size >> 20), 532 (unsigned long)(crash_size >> 20),
533 (unsigned long)(crash_base >> 20), 533 (unsigned long)(crash_base >> 20),
534 (unsigned long)(total_mem >> 20)); 534 (unsigned long)(total_mem >> 20));
535
536 if (reserve_bootmem(crash_base, crash_size,
537 BOOTMEM_EXCLUSIVE) < 0) {
538 printk(KERN_INFO "crashkernel reservation "
539 "failed - memory is in use\n");
540 return;
541 }
542
535 crashk_res.start = crash_base; 543 crashk_res.start = crash_base;
536 crashk_res.end = crash_base + crash_size - 1; 544 crashk_res.end = crash_base + crash_size - 1;
537 reserve_bootmem(crash_base, crash_size,
538 BOOTMEM_DEFAULT);
539 } else 545 } else
540 printk(KERN_INFO "crashkernel reservation failed - " 546 printk(KERN_INFO "crashkernel reservation failed - "
541 "you have to specify a base address\n"); 547 "you have to specify a base address\n");
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 38988491c622..3e1cecedde42 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -996,7 +996,6 @@ do_rest:
996#endif 996#endif
997 cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ 997 cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
998 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ 998 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
999 cpu_clear(cpu, cpu_possible_map);
1000 cpu_clear(cpu, cpu_present_map); 999 cpu_clear(cpu, cpu_present_map);
1001 per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; 1000 per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
1002 } 1001 }
@@ -1190,6 +1189,7 @@ static void __init smp_cpu_index_default(void)
1190 */ 1189 */
1191void __init native_smp_prepare_cpus(unsigned int max_cpus) 1190void __init native_smp_prepare_cpus(unsigned int max_cpus)
1192{ 1191{
1192 preempt_disable();
1193 nmi_watchdog_default(); 1193 nmi_watchdog_default();
1194 smp_cpu_index_default(); 1194 smp_cpu_index_default();
1195 current_cpu_data = boot_cpu_data; 1195 current_cpu_data = boot_cpu_data;
@@ -1206,7 +1206,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1206 if (smp_sanity_check(max_cpus) < 0) { 1206 if (smp_sanity_check(max_cpus) < 0) {
1207 printk(KERN_INFO "SMP disabled\n"); 1207 printk(KERN_INFO "SMP disabled\n");
1208 disable_smp(); 1208 disable_smp();
1209 return; 1209 goto out;
1210 } 1210 }
1211 1211
1212 preempt_disable(); 1212 preempt_disable();
@@ -1246,6 +1246,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1246 printk(KERN_INFO "CPU%d: ", 0); 1246 printk(KERN_INFO "CPU%d: ", 0);
1247 print_cpu_info(&cpu_data(0)); 1247 print_cpu_info(&cpu_data(0));
1248 setup_boot_clock(); 1248 setup_boot_clock();
1249out:
1250 preempt_enable();
1249} 1251}
1250/* 1252/*
1251 * Early setup to make printk work. 1253 * Early setup to make printk work.
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index bde6f63e15d5..08d752de4eee 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -544,6 +544,7 @@ vm86_trap:
544#define DO_ERROR(trapnr, signr, str, name) \ 544#define DO_ERROR(trapnr, signr, str, name) \
545void do_##name(struct pt_regs *regs, long error_code) \ 545void do_##name(struct pt_regs *regs, long error_code) \
546{ \ 546{ \
547 trace_hardirqs_fixup(); \
547 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 548 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
548 == NOTIFY_STOP) \ 549 == NOTIFY_STOP) \
549 return; \ 550 return; \
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index e4790728b224..65b70637ad97 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -14,7 +14,10 @@
14 14
15#include "mach_timer.h" 15#include "mach_timer.h"
16 16
17static int tsc_enabled; 17/* native_sched_clock() is called before tsc_init(), so
18 we must start with the TSC soft disabled to prevent
19 erroneous rdtsc usage on !cpu_has_tsc processors */
20static int tsc_disabled = -1;
18 21
19/* 22/*
20 * On some systems the TSC frequency does not 23 * On some systems the TSC frequency does not
@@ -28,8 +31,8 @@ EXPORT_SYMBOL_GPL(tsc_khz);
28static int __init tsc_setup(char *str) 31static int __init tsc_setup(char *str)
29{ 32{
30 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " 33 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
31 "cannot disable TSC completely.\n"); 34 "cannot disable TSC completely.\n");
32 mark_tsc_unstable("user disabled TSC"); 35 tsc_disabled = 1;
33 return 1; 36 return 1;
34} 37}
35#else 38#else
@@ -120,7 +123,7 @@ unsigned long long native_sched_clock(void)
120 * very important for it to be as fast as the platform 123 * very important for it to be as fast as the platform
121 * can achive it. ) 124 * can achive it. )
122 */ 125 */
123 if (unlikely(!tsc_enabled && !tsc_unstable)) 126 if (unlikely(tsc_disabled))
124 /* No locking but a rare wrong value is not a big deal: */ 127 /* No locking but a rare wrong value is not a big deal: */
125 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); 128 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
126 129
@@ -322,7 +325,6 @@ void mark_tsc_unstable(char *reason)
322{ 325{
323 if (!tsc_unstable) { 326 if (!tsc_unstable) {
324 tsc_unstable = 1; 327 tsc_unstable = 1;
325 tsc_enabled = 0;
326 printk("Marking TSC unstable due to: %s.\n", reason); 328 printk("Marking TSC unstable due to: %s.\n", reason);
327 /* Can be called before registration */ 329 /* Can be called before registration */
328 if (clocksource_tsc.mult) 330 if (clocksource_tsc.mult)
@@ -336,7 +338,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
336static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) 338static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
337{ 339{
338 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", 340 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
339 d->ident); 341 d->ident);
340 tsc_unstable = 1; 342 tsc_unstable = 1;
341 return 0; 343 return 0;
342} 344}
@@ -403,7 +405,7 @@ void __init tsc_init(void)
403{ 405{
404 int cpu; 406 int cpu;
405 407
406 if (!cpu_has_tsc) 408 if (!cpu_has_tsc || tsc_disabled > 0)
407 return; 409 return;
408 410
409 cpu_khz = calculate_cpu_khz(); 411 cpu_khz = calculate_cpu_khz();
@@ -414,6 +416,9 @@ void __init tsc_init(void)
414 return; 416 return;
415 } 417 }
416 418
419 /* now allow native_sched_clock() to use rdtsc */
420 tsc_disabled = 0;
421
417 printk("Detected %lu.%03lu MHz processor.\n", 422 printk("Detected %lu.%03lu MHz processor.\n",
418 (unsigned long)cpu_khz / 1000, 423 (unsigned long)cpu_khz / 1000,
419 (unsigned long)cpu_khz % 1000); 424 (unsigned long)cpu_khz % 1000);
@@ -441,8 +446,6 @@ void __init tsc_init(void)
441 if (check_tsc_unstable()) { 446 if (check_tsc_unstable()) {
442 clocksource_tsc.rating = 0; 447 clocksource_tsc.rating = 0;
443 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 448 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
444 } else 449 }
445 tsc_enabled = 1;
446
447 clocksource_register(&clocksource_tsc); 450 clocksource_register(&clocksource_tsc);
448} 451}
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index fcc16e58609e..1784b8077a12 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -227,14 +227,14 @@ void __init tsc_calibrate(void)
227 /* hpet or pmtimer available ? */ 227 /* hpet or pmtimer available ? */
228 if (!hpet && !pm1 && !pm2) { 228 if (!hpet && !pm1 && !pm2) {
229 printk(KERN_INFO "TSC calibrated against PIT\n"); 229 printk(KERN_INFO "TSC calibrated against PIT\n");
230 return; 230 goto out;
231 } 231 }
232 232
233 /* Check, whether the sampling was disturbed by an SMI */ 233 /* Check, whether the sampling was disturbed by an SMI */
234 if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) { 234 if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) {
235 printk(KERN_WARNING "TSC calibration disturbed by SMI, " 235 printk(KERN_WARNING "TSC calibration disturbed by SMI, "
236 "using PIT calibration result\n"); 236 "using PIT calibration result\n");
237 return; 237 goto out;
238 } 238 }
239 239
240 tsc2 = (tsc2 - tsc1) * 1000000L; 240 tsc2 = (tsc2 - tsc1) * 1000000L;
@@ -255,6 +255,7 @@ void __init tsc_calibrate(void)
255 255
256 tsc_khz = tsc2 / tsc1; 256 tsc_khz = tsc2 / tsc1;
257 257
258out:
258 for_each_possible_cpu(cpu) 259 for_each_possible_cpu(cpu)
259 set_cyc2ns_scale(tsc_khz, cpu); 260 set_cyc2ns_scale(tsc_khz, cpu);
260} 261}
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 3324d90038e4..3829aa7b663f 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,10 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
200 200
201 atomic_inc(&pt->pending); 201 atomic_inc(&pt->pending);
202 smp_mb__after_atomic_inc(); 202 smp_mb__after_atomic_inc();
203 /* FIXME: handle case where the guest is in guest mode */ 203 if (vcpu0) {
204 if (vcpu0 && waitqueue_active(&vcpu0->wq)) { 204 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
205 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 205 if (waitqueue_active(&vcpu0->wq)) {
206 wake_up_interruptible(&vcpu0->wq); 206 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
207 wake_up_interruptible(&vcpu0->wq);
208 }
207 } 209 }
208 210
209 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 211 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
@@ -216,7 +218,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
216{ 218{
217 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 219 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
218 220
219 if (pit && vcpu->vcpu_id == 0) 221 if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
220 return atomic_read(&pit->pit_state.pit_timer.pending); 222 return atomic_read(&pit->pit_state.pit_timer.pending);
221 223
222 return 0; 224 return 0;
@@ -237,6 +239,19 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
237 return HRTIMER_NORESTART; 239 return HRTIMER_NORESTART;
238} 240}
239 241
242void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
243{
244 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
245 struct hrtimer *timer;
246
247 if (vcpu->vcpu_id != 0 || !pit)
248 return;
249
250 timer = &pit->pit_state.pit_timer.timer;
251 if (hrtimer_cancel(timer))
252 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
253}
254
240static void destroy_pit_timer(struct kvm_kpit_timer *pt) 255static void destroy_pit_timer(struct kvm_kpit_timer *pt)
241{ 256{
242 pr_debug("pit: execute del timer!\n"); 257 pr_debug("pit: execute del timer!\n");
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index ce1f583459b1..76d736b5f664 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -94,3 +94,9 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
94 /* TODO: PIT, RTC etc. */ 94 /* TODO: PIT, RTC etc. */
95} 95}
96EXPORT_SYMBOL_GPL(kvm_timer_intr_post); 96EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
97
98void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
99{
100 __kvm_migrate_apic_timer(vcpu);
101 __kvm_migrate_pit_timer(vcpu);
102}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 1802134b836f..2a15be2275c0 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -84,6 +84,8 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
84void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); 84void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
85void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); 85void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
86void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); 86void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
87void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
88void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
87 89
88int pit_has_pending_timer(struct kvm_vcpu *vcpu); 90int pit_has_pending_timer(struct kvm_vcpu *vcpu);
89int apic_has_pending_timer(struct kvm_vcpu *vcpu); 91int apic_has_pending_timer(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36809d79788b..ebc03f5ae162 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
940 wait_queue_head_t *q = &apic->vcpu->wq; 940 wait_queue_head_t *q = &apic->vcpu->wq;
941 941
942 atomic_inc(&apic->timer.pending); 942 atomic_inc(&apic->timer.pending);
943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
943 if (waitqueue_active(q)) { 944 if (waitqueue_active(q)) {
944 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 945 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
945 wake_up_interruptible(q); 946 wake_up_interruptible(q);
@@ -957,7 +958,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
957{ 958{
958 struct kvm_lapic *lapic = vcpu->arch.apic; 959 struct kvm_lapic *lapic = vcpu->arch.apic;
959 960
960 if (lapic) 961 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
961 return atomic_read(&lapic->timer.pending); 962 return atomic_read(&lapic->timer.pending);
962 963
963 return 0; 964 return 0;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 36c5406b1813..7e7c3969f7a2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
640 rmap_remove(kvm, spte); 640 rmap_remove(kvm, spte);
641 --kvm->stat.lpages; 641 --kvm->stat.lpages;
642 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 642 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
643 spte = NULL;
643 write_protected = 1; 644 write_protected = 1;
644 } 645 }
645 spte = rmap_next(kvm, rmapp, spte); 646 spte = rmap_next(kvm, rmapp, spte);
@@ -658,7 +659,7 @@ static int is_empty_shadow_page(u64 *spt)
658 u64 *end; 659 u64 *end;
659 660
660 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) 661 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
661 if (*pos != shadow_trap_nonpresent_pte) { 662 if (is_shadow_present_pte(*pos)) {
662 printk(KERN_ERR "%s: %p %llx\n", __func__, 663 printk(KERN_ERR "%s: %p %llx\n", __func__,
663 pos, *pos); 664 pos, *pos);
664 return 0; 665 return 0;
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1082 struct kvm_mmu_page *shadow; 1083 struct kvm_mmu_page *shadow;
1083 1084
1084 spte |= PT_WRITABLE_MASK; 1085 spte |= PT_WRITABLE_MASK;
1085 if (user_fault) {
1086 mmu_unshadow(vcpu->kvm, gfn);
1087 goto unshadowed;
1088 }
1089 1086
1090 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1087 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1091 if (shadow || 1088 if (shadow ||
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1102 } 1099 }
1103 } 1100 }
1104 1101
1105unshadowed:
1106
1107 if (pte_access & ACC_WRITE_MASK) 1102 if (pte_access & ACC_WRITE_MASK)
1108 mark_page_dirty(vcpu->kvm, gfn); 1103 mark_page_dirty(vcpu->kvm, gfn);
1109 1104
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1580 u64 *spte, 1575 u64 *spte,
1581 const void *new) 1576 const void *new)
1582{ 1577{
1583 if ((sp->role.level != PT_PAGE_TABLE_LEVEL) 1578 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1584 && !vcpu->arch.update_pte.largepage) { 1579 if (!vcpu->arch.update_pte.largepage ||
1585 ++vcpu->kvm->stat.mmu_pde_zapped; 1580 sp->role.glevels == PT32_ROOT_LEVEL) {
1586 return; 1581 ++vcpu->kvm->stat.mmu_pde_zapped;
1587 } 1582 return;
1583 }
1584 }
1588 1585
1589 ++vcpu->kvm->stat.mmu_pte_updated; 1586 ++vcpu->kvm->stat.mmu_pte_updated;
1590 if (sp->role.glevels == PT32_ROOT_LEVEL) 1587 if (sp->role.glevels == PT32_ROOT_LEVEL)
@@ -1858,6 +1855,7 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu)
1858 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, 1855 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
1859 struct kvm_mmu_page, link); 1856 struct kvm_mmu_page, link);
1860 kvm_mmu_zap_page(vcpu->kvm, sp); 1857 kvm_mmu_zap_page(vcpu->kvm, sp);
1858 cond_resched();
1861 } 1859 }
1862 free_page((unsigned long)vcpu->arch.mmu.pae_root); 1860 free_page((unsigned long)vcpu->arch.mmu.pae_root);
1863} 1861}
@@ -1996,7 +1994,7 @@ static struct shrinker mmu_shrinker = {
1996 .seeks = DEFAULT_SEEKS * 10, 1994 .seeks = DEFAULT_SEEKS * 10,
1997}; 1995};
1998 1996
1999void mmu_destroy_caches(void) 1997static void mmu_destroy_caches(void)
2000{ 1998{
2001 if (pte_chain_cache) 1999 if (pte_chain_cache)
2002 kmem_cache_destroy(pte_chain_cache); 2000 kmem_cache_destroy(pte_chain_cache);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 156fe10288ae..934c7b619396 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -418,7 +418,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
418 418
419 /* mmio */ 419 /* mmio */
420 if (is_error_pfn(pfn)) { 420 if (is_error_pfn(pfn)) {
421 pgprintk("gfn %x is mmio\n", walker.gfn); 421 pgprintk("gfn %lx is mmio\n", walker.gfn);
422 kvm_release_pfn_clean(pfn); 422 kvm_release_pfn_clean(pfn);
423 return 1; 423 return 1;
424 } 424 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ab22615eee89..6b0d5fa5bab3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -688,7 +688,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
688 delta = vcpu->arch.host_tsc - tsc_this; 688 delta = vcpu->arch.host_tsc - tsc_this;
689 svm->vmcb->control.tsc_offset += delta; 689 svm->vmcb->control.tsc_offset += delta;
690 vcpu->cpu = cpu; 690 vcpu->cpu = cpu;
691 kvm_migrate_apic_timer(vcpu); 691 kvm_migrate_timers(vcpu);
692 } 692 }
693 693
694 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 694 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bfe4db11989c..540e95179074 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
566 load_transition_efer(vmx); 566 load_transition_efer(vmx);
567} 567}
568 568
569static void vmx_load_host_state(struct vcpu_vmx *vmx) 569static void __vmx_load_host_state(struct vcpu_vmx *vmx)
570{ 570{
571 unsigned long flags; 571 unsigned long flags;
572 572
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
596 reload_host_efer(vmx); 596 reload_host_efer(vmx);
597} 597}
598 598
599static void vmx_load_host_state(struct vcpu_vmx *vmx)
600{
601 preempt_disable();
602 __vmx_load_host_state(vmx);
603 preempt_enable();
604}
605
599/* 606/*
600 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 607 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
601 * vcpu mutex is already taken. 608 * vcpu mutex is already taken.
@@ -608,7 +615,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
608 615
609 if (vcpu->cpu != cpu) { 616 if (vcpu->cpu != cpu) {
610 vcpu_clear(vmx); 617 vcpu_clear(vmx);
611 kvm_migrate_apic_timer(vcpu); 618 kvm_migrate_timers(vcpu);
612 vpid_sync_vcpu_all(vmx); 619 vpid_sync_vcpu_all(vmx);
613 } 620 }
614 621
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
654 661
655static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 662static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
656{ 663{
657 vmx_load_host_state(to_vmx(vcpu)); 664 __vmx_load_host_state(to_vmx(vcpu));
658} 665}
659 666
660static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 667static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
884 switch (msr_index) { 891 switch (msr_index) {
885#ifdef CONFIG_X86_64 892#ifdef CONFIG_X86_64
886 case MSR_EFER: 893 case MSR_EFER:
894 vmx_load_host_state(vmx);
887 ret = kvm_set_msr_common(vcpu, msr_index, data); 895 ret = kvm_set_msr_common(vcpu, msr_index, data);
888 if (vmx->host_state.loaded) {
889 reload_host_efer(vmx);
890 load_transition_efer(vmx);
891 }
892 break; 896 break;
893 case MSR_FS_BASE: 897 case MSR_FS_BASE:
894 vmcs_writel(GUEST_FS_BASE, data); 898 vmcs_writel(GUEST_FS_BASE, data);
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
910 guest_write_tsc(data); 914 guest_write_tsc(data);
911 break; 915 break;
912 default: 916 default:
917 vmx_load_host_state(vmx);
913 msr = find_msr_entry(vmx, msr_index); 918 msr = find_msr_entry(vmx, msr_index);
914 if (msr) { 919 if (msr) {
915 msr->data = data; 920 msr->data = data;
916 if (vmx->host_state.loaded)
917 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
918 break; 921 break;
919 } 922 }
920 ret = kvm_set_msr_common(vcpu, msr_index, data); 923 ret = kvm_set_msr_common(vcpu, msr_index, data);
@@ -1036,6 +1039,7 @@ static void hardware_enable(void *garbage)
1036static void hardware_disable(void *garbage) 1039static void hardware_disable(void *garbage)
1037{ 1040{
1038 asm volatile (ASM_VMX_VMXOFF : : : "cc"); 1041 asm volatile (ASM_VMX_VMXOFF : : : "cc");
1042 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1039} 1043}
1040 1044
1041static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 1045static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 21338bdb28ff..63a77caa59f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
492static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 492static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
493{ 493{
494 static int version; 494 static int version;
495 struct kvm_wall_clock wc; 495 struct pvclock_wall_clock wc;
496 struct timespec wc_ts; 496 struct timespec now, sys, boot;
497 497
498 if (!wall_clock) 498 if (!wall_clock)
499 return; 499 return;
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
502 502
503 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 503 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
504 504
505 wc_ts = current_kernel_time(); 505 /*
506 wc.wc_sec = wc_ts.tv_sec; 506 * The guest calculates current wall clock time by adding
507 wc.wc_nsec = wc_ts.tv_nsec; 507 * system time (updated by kvm_write_guest_time below) to the
508 wc.wc_version = version; 508 * wall clock specified here. guest system time equals host
509 * system time for us, thus we must fill in host boot time here.
510 */
511 now = current_kernel_time();
512 ktime_get_ts(&sys);
513 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
514
515 wc.sec = boot.tv_sec;
516 wc.nsec = boot.tv_nsec;
517 wc.version = version;
509 518
510 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); 519 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
511 520
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
513 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 522 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
514} 523}
515 524
525static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
526{
527 uint32_t quotient, remainder;
528
529 /* Don't try to replace with do_div(), this one calculates
530 * "(dividend << 32) / divisor" */
531 __asm__ ( "divl %4"
532 : "=a" (quotient), "=d" (remainder)
533 : "0" (0), "1" (dividend), "r" (divisor) );
534 return quotient;
535}
536
537static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
538{
539 uint64_t nsecs = 1000000000LL;
540 int32_t shift = 0;
541 uint64_t tps64;
542 uint32_t tps32;
543
544 tps64 = tsc_khz * 1000LL;
545 while (tps64 > nsecs*2) {
546 tps64 >>= 1;
547 shift--;
548 }
549
550 tps32 = (uint32_t)tps64;
551 while (tps32 <= (uint32_t)nsecs) {
552 tps32 <<= 1;
553 shift++;
554 }
555
556 hv_clock->tsc_shift = shift;
557 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
558
559 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
560 __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
561 hv_clock->tsc_to_system_mul);
562}
563
516static void kvm_write_guest_time(struct kvm_vcpu *v) 564static void kvm_write_guest_time(struct kvm_vcpu *v)
517{ 565{
518 struct timespec ts; 566 struct timespec ts;
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
523 if ((!vcpu->time_page)) 571 if ((!vcpu->time_page))
524 return; 572 return;
525 573
574 if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
575 kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
576 vcpu->hv_clock_tsc_khz = tsc_khz;
577 }
578
526 /* Keep irq disabled to prevent changes to the clock */ 579 /* Keep irq disabled to prevent changes to the clock */
527 local_irq_save(flags); 580 local_irq_save(flags);
528 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 581 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
537 /* 590 /*
538 * The interface expects us to write an even number signaling that the 591 * The interface expects us to write an even number signaling that the
539 * update is finished. Since the guest won't see the intermediate 592 * update is finished. Since the guest won't see the intermediate
540 * state, we just write "2" at the end 593 * state, we just increase by 2 at the end.
541 */ 594 */
542 vcpu->hv_clock.version = 2; 595 vcpu->hv_clock.version += 2;
543 596
544 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); 597 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
545 598
546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 599 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
547 sizeof(vcpu->hv_clock)); 600 sizeof(vcpu->hv_clock));
548 601
549 kunmap_atomic(shared_kaddr, KM_USER0); 602 kunmap_atomic(shared_kaddr, KM_USER0);
550 603
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
599 /* ...but clean it before doing the actual write */ 652 /* ...but clean it before doing the actual write */
600 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 653 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
601 654
602 vcpu->arch.hv_clock.tsc_to_system_mul =
603 clocksource_khz2mult(tsc_khz, 22);
604 vcpu->arch.hv_clock.tsc_shift = 22;
605
606 down_read(&current->mm->mmap_sem); 655 down_read(&current->mm->mmap_sem);
607 vcpu->arch.time_page = 656 vcpu->arch.time_page =
608 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 657 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
@@ -2758,7 +2807,9 @@ again:
2758 2807
2759 if (vcpu->requests) { 2808 if (vcpu->requests) {
2760 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2809 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2761 __kvm_migrate_apic_timer(vcpu); 2810 __kvm_migrate_timers(vcpu);
2811 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2812 kvm_x86_ops->tlb_flush(vcpu);
2762 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2813 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2763 &vcpu->requests)) { 2814 &vcpu->requests)) {
2764 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 2815 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -2772,6 +2823,7 @@ again:
2772 } 2823 }
2773 } 2824 }
2774 2825
2826 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
2775 kvm_inject_pending_timer_irqs(vcpu); 2827 kvm_inject_pending_timer_irqs(vcpu);
2776 2828
2777 preempt_disable(); 2829 preempt_disable();
@@ -2781,21 +2833,13 @@ again:
2781 2833
2782 local_irq_disable(); 2834 local_irq_disable();
2783 2835
2784 if (need_resched()) { 2836 if (vcpu->requests || need_resched()) {
2785 local_irq_enable(); 2837 local_irq_enable();
2786 preempt_enable(); 2838 preempt_enable();
2787 r = 1; 2839 r = 1;
2788 goto out; 2840 goto out;
2789 } 2841 }
2790 2842
2791 if (vcpu->requests)
2792 if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
2793 local_irq_enable();
2794 preempt_enable();
2795 r = 1;
2796 goto out;
2797 }
2798
2799 if (signal_pending(current)) { 2843 if (signal_pending(current)) {
2800 local_irq_enable(); 2844 local_irq_enable();
2801 preempt_enable(); 2845 preempt_enable();
@@ -2825,9 +2869,6 @@ again:
2825 2869
2826 kvm_guest_enter(); 2870 kvm_guest_enter();
2827 2871
2828 if (vcpu->requests)
2829 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2830 kvm_x86_ops->tlb_flush(vcpu);
2831 2872
2832 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 2873 KVMTRACE_0D(VMENTRY, vcpu, entryexit);
2833 kvm_x86_ops->run(vcpu, kvm_run); 2874 kvm_x86_ops->run(vcpu, kvm_run);
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f2a696d6a243..932f216d890c 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -677,8 +677,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
677 c->use_modrm_ea = 1; 677 c->use_modrm_ea = 1;
678 678
679 if (c->modrm_mod == 3) { 679 if (c->modrm_mod == 3) {
680 c->modrm_val = *(unsigned long *) 680 c->modrm_ptr = decode_register(c->modrm_rm,
681 decode_register(c->modrm_rm, c->regs, c->d & ByteOp); 681 c->regs, c->d & ByteOp);
682 c->modrm_val = *(unsigned long *)c->modrm_ptr;
682 return rc; 683 return rc;
683 } 684 }
684 685
@@ -1005,6 +1006,7 @@ done_prefixes:
1005 if ((c->d & ModRM) && c->modrm_mod == 3) { 1006 if ((c->d & ModRM) && c->modrm_mod == 3) {
1006 c->src.type = OP_REG; 1007 c->src.type = OP_REG;
1007 c->src.val = c->modrm_val; 1008 c->src.val = c->modrm_val;
1009 c->src.ptr = c->modrm_ptr;
1008 break; 1010 break;
1009 } 1011 }
1010 c->src.type = OP_MEM; 1012 c->src.type = OP_MEM;
@@ -1049,6 +1051,7 @@ done_prefixes:
1049 if ((c->d & ModRM) && c->modrm_mod == 3) { 1051 if ((c->d & ModRM) && c->modrm_mod == 3) {
1050 c->dst.type = OP_REG; 1052 c->dst.type = OP_REG;
1051 c->dst.val = c->dst.orig_val = c->modrm_val; 1053 c->dst.val = c->dst.orig_val = c->modrm_val;
1054 c->dst.ptr = c->modrm_ptr;
1052 break; 1055 break;
1053 } 1056 }
1054 c->dst.type = OP_MEM; 1057 c->dst.type = OP_MEM;
@@ -1724,7 +1727,8 @@ twobyte_insn:
1724 if (rc) 1727 if (rc)
1725 goto done; 1728 goto done;
1726 1729
1727 kvm_emulate_hypercall(ctxt->vcpu); 1730 /* Let the processor re-execute the fixed hypercall */
1731 c->eip = ctxt->vcpu->arch.rip;
1728 /* Disable writeback. */ 1732 /* Disable writeback. */
1729 c->dst.type = OP_NONE; 1733 c->dst.type = OP_NONE;
1730 break; 1734 break;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index af65b2da3ba0..5c7e2fd52075 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -582,8 +582,9 @@ static void __init lguest_init_IRQ(void)
582 int vector = FIRST_EXTERNAL_VECTOR + i; 582 int vector = FIRST_EXTERNAL_VECTOR + i;
583 if (vector != SYSCALL_VECTOR) { 583 if (vector != SYSCALL_VECTOR) {
584 set_intr_gate(vector, interrupt[i]); 584 set_intr_gate(vector, interrupt[i]);
585 set_irq_chip_and_handler(i, &lguest_irq_controller, 585 set_irq_chip_and_handler_name(i, &lguest_irq_controller,
586 handle_level_irq); 586 handle_level_irq,
587 "level");
587 } 588 }
588 } 589 }
589 /* This call is required to set up for 4k stacks, where we have 590 /* This call is required to set up for 4k stacks, where we have
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 70bebd310408..ee1c3f635157 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -217,19 +217,19 @@ ENTRY(copy_user_generic_unrolled)
217 /* table sorted by exception address */ 217 /* table sorted by exception address */
218 .section __ex_table,"a" 218 .section __ex_table,"a"
219 .align 8 219 .align 8
220 .quad .Ls1,.Ls1e 220 .quad .Ls1,.Ls1e /* Ls1-Ls4 have copied zero bytes */
221 .quad .Ls2,.Ls2e 221 .quad .Ls2,.Ls1e
222 .quad .Ls3,.Ls3e 222 .quad .Ls3,.Ls1e
223 .quad .Ls4,.Ls4e 223 .quad .Ls4,.Ls1e
224 .quad .Ld1,.Ls1e 224 .quad .Ld1,.Ls1e /* Ld1-Ld4 have copied 0-24 bytes */
225 .quad .Ld2,.Ls2e 225 .quad .Ld2,.Ls2e
226 .quad .Ld3,.Ls3e 226 .quad .Ld3,.Ls3e
227 .quad .Ld4,.Ls4e 227 .quad .Ld4,.Ls4e
228 .quad .Ls5,.Ls5e 228 .quad .Ls5,.Ls5e /* Ls5-Ls8 have copied 32 bytes */
229 .quad .Ls6,.Ls6e 229 .quad .Ls6,.Ls5e
230 .quad .Ls7,.Ls7e 230 .quad .Ls7,.Ls5e
231 .quad .Ls8,.Ls8e 231 .quad .Ls8,.Ls5e
232 .quad .Ld5,.Ls5e 232 .quad .Ld5,.Ls5e /* Ld5-Ld8 have copied 32-56 bytes */
233 .quad .Ld6,.Ls6e 233 .quad .Ld6,.Ls6e
234 .quad .Ld7,.Ls7e 234 .quad .Ld7,.Ls7e
235 .quad .Ld8,.Ls8e 235 .quad .Ld8,.Ls8e
@@ -244,11 +244,8 @@ ENTRY(copy_user_generic_unrolled)
244 .quad .Le5,.Le_zero 244 .quad .Le5,.Le_zero
245 .previous 245 .previous
246 246
247 /* compute 64-offset for main loop. 8 bytes accuracy with error on the
248 pessimistic side. this is gross. it would be better to fix the
249 interface. */
250 /* eax: zero, ebx: 64 */ 247 /* eax: zero, ebx: 64 */
251.Ls1e: addl $8,%eax 248.Ls1e: addl $8,%eax /* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */
252.Ls2e: addl $8,%eax 249.Ls2e: addl $8,%eax
253.Ls3e: addl $8,%eax 250.Ls3e: addl $8,%eax
254.Ls4e: addl $8,%eax 251.Ls4e: addl $8,%eax
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 5196762b3b0e..9d3d1ab83763 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -145,19 +145,19 @@ ENTRY(__copy_user_nocache)
145 /* table sorted by exception address */ 145 /* table sorted by exception address */
146 .section __ex_table,"a" 146 .section __ex_table,"a"
147 .align 8 147 .align 8
148 .quad .Ls1,.Ls1e 148 .quad .Ls1,.Ls1e /* .Ls[1-4] - 0 bytes copied */
149 .quad .Ls2,.Ls2e 149 .quad .Ls2,.Ls1e
150 .quad .Ls3,.Ls3e 150 .quad .Ls3,.Ls1e
151 .quad .Ls4,.Ls4e 151 .quad .Ls4,.Ls1e
152 .quad .Ld1,.Ls1e 152 .quad .Ld1,.Ls1e /* .Ld[1-4] - 0..24 bytes copied */
153 .quad .Ld2,.Ls2e 153 .quad .Ld2,.Ls2e
154 .quad .Ld3,.Ls3e 154 .quad .Ld3,.Ls3e
155 .quad .Ld4,.Ls4e 155 .quad .Ld4,.Ls4e
156 .quad .Ls5,.Ls5e 156 .quad .Ls5,.Ls5e /* .Ls[5-8] - 32 bytes copied */
157 .quad .Ls6,.Ls6e 157 .quad .Ls6,.Ls5e
158 .quad .Ls7,.Ls7e 158 .quad .Ls7,.Ls5e
159 .quad .Ls8,.Ls8e 159 .quad .Ls8,.Ls5e
160 .quad .Ld5,.Ls5e 160 .quad .Ld5,.Ls5e /* .Ld[5-8] - 32..56 bytes copied */
161 .quad .Ld6,.Ls6e 161 .quad .Ld6,.Ls6e
162 .quad .Ld7,.Ls7e 162 .quad .Ld7,.Ls7e
163 .quad .Ld8,.Ls8e 163 .quad .Ld8,.Ls8e
@@ -172,11 +172,8 @@ ENTRY(__copy_user_nocache)
172 .quad .Le5,.Le_zero 172 .quad .Le5,.Le_zero
173 .previous 173 .previous
174 174
175 /* compute 64-offset for main loop. 8 bytes accuracy with error on the
176 pessimistic side. this is gross. it would be better to fix the
177 interface. */
178 /* eax: zero, ebx: 64 */ 175 /* eax: zero, ebx: 64 */
179.Ls1e: addl $8,%eax 176.Ls1e: addl $8,%eax /* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */
180.Ls2e: addl $8,%eax 177.Ls2e: addl $8,%eax
181.Ls3e: addl $8,%eax 178.Ls3e: addl $8,%eax
182.Ls4e: addl $8,%eax 179.Ls4e: addl $8,%eax
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c
index 4535e6d147ad..d710f2d167bb 100644
--- a/arch/x86/lib/delay_32.c
+++ b/arch/x86/lib/delay_32.c
@@ -44,13 +44,36 @@ static void delay_loop(unsigned long loops)
44static void delay_tsc(unsigned long loops) 44static void delay_tsc(unsigned long loops)
45{ 45{
46 unsigned long bclock, now; 46 unsigned long bclock, now;
47 int cpu;
47 48
48 preempt_disable(); /* TSC's are per-cpu */ 49 preempt_disable();
50 cpu = smp_processor_id();
49 rdtscl(bclock); 51 rdtscl(bclock);
50 do { 52 for (;;) {
51 rep_nop();
52 rdtscl(now); 53 rdtscl(now);
53 } while ((now-bclock) < loops); 54 if ((now - bclock) >= loops)
55 break;
56
57 /* Allow RT tasks to run */
58 preempt_enable();
59 rep_nop();
60 preempt_disable();
61
62 /*
63 * It is possible that we moved to another CPU, and
64 * since TSC's are per-cpu we need to calculate
65 * that. The delay must guarantee that we wait "at
66 * least" the amount of time. Being moved to another
67 * CPU could make the wait longer but we just need to
68 * make sure we waited long enough. Rebalance the
69 * counter for this CPU.
70 */
71 if (unlikely(cpu != smp_processor_id())) {
72 loops -= (now - bclock);
73 cpu = smp_processor_id();
74 rdtscl(bclock);
75 }
76 }
54 preempt_enable(); 77 preempt_enable();
55} 78}
56 79
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
index bbc610518516..4c441be92641 100644
--- a/arch/x86/lib/delay_64.c
+++ b/arch/x86/lib/delay_64.c
@@ -31,14 +31,36 @@ int __devinit read_current_timer(unsigned long *timer_value)
31void __delay(unsigned long loops) 31void __delay(unsigned long loops)
32{ 32{
33 unsigned bclock, now; 33 unsigned bclock, now;
34 int cpu;
34 35
35 preempt_disable(); /* TSC's are pre-cpu */ 36 preempt_disable();
37 cpu = smp_processor_id();
36 rdtscl(bclock); 38 rdtscl(bclock);
37 do { 39 for (;;) {
38 rep_nop();
39 rdtscl(now); 40 rdtscl(now);
41 if ((now - bclock) >= loops)
42 break;
43
44 /* Allow RT tasks to run */
45 preempt_enable();
46 rep_nop();
47 preempt_disable();
48
49 /*
50 * It is possible that we moved to another CPU, and
51 * since TSC's are per-cpu we need to calculate
52 * that. The delay must guarantee that we wait "at
53 * least" the amount of time. Being moved to another
54 * CPU could make the wait longer but we just need to
55 * make sure we waited long enough. Rebalance the
56 * counter for this CPU.
57 */
58 if (unlikely(cpu != smp_processor_id())) {
59 loops -= (now - bclock);
60 cpu = smp_processor_id();
61 rdtscl(bclock);
62 }
40 } 63 }
41 while ((now-bclock) < loops);
42 preempt_enable(); 64 preempt_enable();
43} 65}
44EXPORT_SYMBOL(__delay); 66EXPORT_SYMBOL(__delay);
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 6e38d877ea77..c7b06feb139b 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -30,6 +30,7 @@
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/desc.h> 31#include <asm/desc.h>
32#include <asm/user.h> 32#include <asm/user.h>
33#include <asm/i387.h>
33 34
34#include "fpu_system.h" 35#include "fpu_system.h"
35#include "fpu_emu.h" 36#include "fpu_emu.h"
@@ -146,6 +147,13 @@ asmlinkage void math_emulate(long arg)
146 unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ 147 unsigned long code_limit = 0; /* Initialized to stop compiler warnings */
147 struct desc_struct code_descriptor; 148 struct desc_struct code_descriptor;
148 149
150 if (!used_math()) {
151 if (init_fpu(current)) {
152 do_group_exit(SIGKILL);
153 return;
154 }
155 }
156
149#ifdef RE_ENTRANT_CHECKING 157#ifdef RE_ENTRANT_CHECKING
150 if (emulating) { 158 if (emulating) {
151 printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n"); 159 printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n");
@@ -153,11 +161,6 @@ asmlinkage void math_emulate(long arg)
153 RE_ENTRANT_CHECK_ON; 161 RE_ENTRANT_CHECK_ON;
154#endif /* RE_ENTRANT_CHECKING */ 162#endif /* RE_ENTRANT_CHECKING */
155 163
156 if (!used_math()) {
157 finit();
158 set_used_math();
159 }
160
161 SETUP_DATA_AREA(arg); 164 SETUP_DATA_AREA(arg);
162 165
163 FPU_ORIG_EIP = FPU_EIP; 166 FPU_ORIG_EIP = FPU_EIP;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fd7e1798c75a..8bcb6f40ccb6 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -497,6 +497,11 @@ static int vmalloc_fault(unsigned long address)
497 unsigned long pgd_paddr; 497 unsigned long pgd_paddr;
498 pmd_t *pmd_k; 498 pmd_t *pmd_k;
499 pte_t *pte_k; 499 pte_t *pte_k;
500
501 /* Make sure we are in vmalloc area */
502 if (!(address >= VMALLOC_START && address < VMALLOC_END))
503 return -1;
504
500 /* 505 /*
501 * Synchronize this task's top level page-table 506 * Synchronize this task's top level page-table
502 * with the 'reference' page table. 507 * with the 'reference' page table.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 32ba13b0f818..819dad973b13 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -135,7 +135,7 @@ static __init void *spp_getpage(void)
135 return ptr; 135 return ptr;
136} 136}
137 137
138static void 138static __init void
139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) 139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
140{ 140{
141 pgd_t *pgd; 141 pgd_t *pgd;
@@ -206,7 +206,7 @@ void __init cleanup_highmap(void)
206 pmd_t *last_pmd = pmd + PTRS_PER_PMD; 206 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
207 207
208 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { 208 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
209 if (!pmd_present(*pmd)) 209 if (pmd_none(*pmd))
210 continue; 210 continue;
211 if (vaddr < (unsigned long) _text || vaddr > end) 211 if (vaddr < (unsigned long) _text || vaddr > end)
212 set_pmd(pmd, __pmd(0)); 212 set_pmd(pmd, __pmd(0));
@@ -214,7 +214,7 @@ void __init cleanup_highmap(void)
214} 214}
215 215
216/* NOTE: this is meant to be run only at boot */ 216/* NOTE: this is meant to be run only at boot */
217void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) 217void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
218{ 218{
219 unsigned long address = __fix_to_virt(idx); 219 unsigned long address = __fix_to_virt(idx);
220 220
@@ -506,7 +506,7 @@ early_param("memtest", parse_memtest);
506 506
507static void __init early_memtest(unsigned long start, unsigned long end) 507static void __init early_memtest(unsigned long start, unsigned long end)
508{ 508{
509 unsigned long t_start, t_size; 509 u64 t_start, t_size;
510 unsigned pattern; 510 unsigned pattern;
511 511
512 if (!memtest_pattern) 512 if (!memtest_pattern)
@@ -525,8 +525,9 @@ static void __init early_memtest(unsigned long start, unsigned long end)
525 if (t_start + t_size > end) 525 if (t_start + t_size > end)
526 t_size = end - t_start; 526 t_size = end - t_start;
527 527
528 printk(KERN_CONT "\n %016lx - %016lx pattern %d", 528 printk(KERN_CONT "\n %016llx - %016llx pattern %d",
529 t_start, t_start + t_size, pattern); 529 (unsigned long long)t_start,
530 (unsigned long long)t_start + t_size, pattern);
530 531
531 memtest(t_start, t_size, pattern); 532 memtest(t_start, t_size, pattern);
532 533
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 71bb3159031a..d1b867101e5f 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -300,6 +300,29 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
300} 300}
301EXPORT_SYMBOL(ioremap_cache); 301EXPORT_SYMBOL(ioremap_cache);
302 302
303static void __iomem *ioremap_default(resource_size_t phys_addr,
304 unsigned long size)
305{
306 unsigned long flags;
307 void *ret;
308 int err;
309
310 /*
311 * - WB for WB-able memory and no other conflicting mappings
312 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
313 * - Inherit from conflicting mappings otherwise
314 */
315 err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
316 if (err < 0)
317 return NULL;
318
319 ret = (void *) __ioremap_caller(phys_addr, size, flags,
320 __builtin_return_address(0));
321
322 free_memtype(phys_addr, phys_addr + size);
323 return (void __iomem *)ret;
324}
325
303/** 326/**
304 * iounmap - Free a IO remapping 327 * iounmap - Free a IO remapping
305 * @addr: virtual address from ioremap_* 328 * @addr: virtual address from ioremap_*
@@ -365,7 +388,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
365 if (page_is_ram(start >> PAGE_SHIFT)) 388 if (page_is_ram(start >> PAGE_SHIFT))
366 return __va(phys); 389 return __va(phys);
367 390
368 addr = (void *)ioremap(start, PAGE_SIZE); 391 addr = (void *)ioremap_default(start, PAGE_SIZE);
369 if (addr) 392 if (addr)
370 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 393 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
371 394
@@ -593,10 +616,11 @@ void __init early_iounmap(void *addr, unsigned long size)
593 unsigned long offset; 616 unsigned long offset;
594 unsigned int nrpages; 617 unsigned int nrpages;
595 enum fixed_addresses idx; 618 enum fixed_addresses idx;
596 unsigned int nesting; 619 int nesting;
597 620
598 nesting = --early_ioremap_nested; 621 nesting = --early_ioremap_nested;
599 WARN_ON(nesting < 0); 622 if (WARN_ON(nesting < 0))
623 return;
600 624
601 if (early_ioremap_debug) { 625 if (early_ioremap_debug) {
602 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, 626 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index de3a99812450..06b7a1c90fb8 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -34,7 +34,7 @@ void __cpuinit pat_disable(char *reason)
34 printk(KERN_INFO "%s\n", reason); 34 printk(KERN_INFO "%s\n", reason);
35} 35}
36 36
37static int nopat(char *str) 37static int __init nopat(char *str)
38{ 38{
39 pat_disable("PAT support disabled."); 39 pat_disable("PAT support disabled.");
40 return 0; 40 return 0;
@@ -151,32 +151,33 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
151 unsigned long pat_type; 151 unsigned long pat_type;
152 u8 mtrr_type; 152 u8 mtrr_type;
153 153
154 mtrr_type = mtrr_type_lookup(start, end);
155 if (mtrr_type == 0xFF) { /* MTRR not enabled */
156 *ret_prot = prot;
157 return 0;
158 }
159 if (mtrr_type == 0xFE) { /* MTRR match error */
160 *ret_prot = _PAGE_CACHE_UC;
161 return -1;
162 }
163 if (mtrr_type != MTRR_TYPE_UNCACHABLE &&
164 mtrr_type != MTRR_TYPE_WRBACK &&
165 mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */
166 *ret_prot = _PAGE_CACHE_UC;
167 return -1;
168 }
169
170 pat_type = prot & _PAGE_CACHE_MASK; 154 pat_type = prot & _PAGE_CACHE_MASK;
171 prot &= (~_PAGE_CACHE_MASK); 155 prot &= (~_PAGE_CACHE_MASK);
172 156
173 /* Currently doing intersection by hand. Optimize it later. */ 157 /*
158 * We return the PAT request directly for types where PAT takes
159 * precedence with respect to MTRR and for UC_MINUS.
160 * Consistency checks with other PAT requests is done later
161 * while going through memtype list.
162 */
174 if (pat_type == _PAGE_CACHE_WC) { 163 if (pat_type == _PAGE_CACHE_WC) {
175 *ret_prot = prot | _PAGE_CACHE_WC; 164 *ret_prot = prot | _PAGE_CACHE_WC;
165 return 0;
176 } else if (pat_type == _PAGE_CACHE_UC_MINUS) { 166 } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
177 *ret_prot = prot | _PAGE_CACHE_UC_MINUS; 167 *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
178 } else if (pat_type == _PAGE_CACHE_UC || 168 return 0;
179 mtrr_type == MTRR_TYPE_UNCACHABLE) { 169 } else if (pat_type == _PAGE_CACHE_UC) {
170 *ret_prot = prot | _PAGE_CACHE_UC;
171 return 0;
172 }
173
174 /*
175 * Look for MTRR hint to get the effective type in case where PAT
176 * request is for WB.
177 */
178 mtrr_type = mtrr_type_lookup(start, end);
179
180 if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
180 *ret_prot = prot | _PAGE_CACHE_UC; 181 *ret_prot = prot | _PAGE_CACHE_UC;
181 } else if (mtrr_type == MTRR_TYPE_WRCOMB) { 182 } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
182 *ret_prot = prot | _PAGE_CACHE_WC; 183 *ret_prot = prot | _PAGE_CACHE_WC;
@@ -233,14 +234,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
233 234
234 if (req_type == -1) { 235 if (req_type == -1) {
235 /* 236 /*
236 * Special case where caller wants to inherit from mtrr or 237 * Call mtrr_lookup to get the type hint. This is an
237 * existing pat mapping, defaulting to UC_MINUS in case of 238 * optimization for /dev/mem mmap'ers into WB memory (BIOS
238 * no match. 239 * tools and ACPI tools). Use WB request for WB memory and use
240 * UC_MINUS otherwise.
239 */ 241 */
240 u8 mtrr_type = mtrr_type_lookup(start, end); 242 u8 mtrr_type = mtrr_type_lookup(start, end);
241 if (mtrr_type == 0xFE) { /* MTRR match error */
242 err = -1;
243 }
244 243
245 if (mtrr_type == MTRR_TYPE_WRBACK) { 244 if (mtrr_type == MTRR_TYPE_WRBACK) {
246 req_type = _PAGE_CACHE_WB; 245 req_type = _PAGE_CACHE_WB;
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 3890234e5b26..99649dccad28 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -97,36 +97,9 @@ static __init inline int srat_disabled(void)
97 return numa_off || acpi_numa < 0; 97 return numa_off || acpi_numa < 0;
98} 98}
99 99
100/*
101 * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
102 * up the NUMA heuristics which wants the local node to have a smaller
103 * distance than the others.
104 * Do some quick checks here and only use the SLIT if it passes.
105 */
106static __init int slit_valid(struct acpi_table_slit *slit)
107{
108 int i, j;
109 int d = slit->locality_count;
110 for (i = 0; i < d; i++) {
111 for (j = 0; j < d; j++) {
112 u8 val = slit->entry[d*i + j];
113 if (i == j) {
114 if (val != LOCAL_DISTANCE)
115 return 0;
116 } else if (val <= LOCAL_DISTANCE)
117 return 0;
118 }
119 }
120 return 1;
121}
122
123/* Callback for SLIT parsing */ 100/* Callback for SLIT parsing */
124void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 101void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
125{ 102{
126 if (!slit_valid(slit)) {
127 printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
128 return;
129 }
130 acpi_slit = slit; 103 acpi_slit = slit;
131} 104}
132 105
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 8545c8a9d107..6e64aaf00d1d 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -302,18 +302,18 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
302 }, 302 },
303 { 303 {
304 .callback = set_bf_sort, 304 .callback = set_bf_sort,
305 .ident = "HP ProLiant DL385 G2", 305 .ident = "HP ProLiant DL360",
306 .matches = { 306 .matches = {
307 DMI_MATCH(DMI_SYS_VENDOR, "HP"), 307 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
308 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL385 G2"), 308 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL360"),
309 }, 309 },
310 }, 310 },
311 { 311 {
312 .callback = set_bf_sort, 312 .callback = set_bf_sort,
313 .ident = "HP ProLiant DL585 G2", 313 .ident = "HP ProLiant DL380",
314 .matches = { 314 .matches = {
315 DMI_MATCH(DMI_SYS_VENDOR, "HP"), 315 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
316 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"), 316 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL380"),
317 }, 317 },
318 }, 318 },
319#ifdef __i386__ 319#ifdef __i386__
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index e70b9c57b88e..b821f4462d99 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -15,7 +15,8 @@ static __init int pci_access_init(void)
15 pci_mmcfg_early_init(); 15 pci_mmcfg_early_init();
16 16
17#ifdef CONFIG_PCI_OLPC 17#ifdef CONFIG_PCI_OLPC
18 pci_olpc_init(); 18 if (!pci_olpc_init())
19 return 0; /* skip additional checks if it's an XO */
19#endif 20#endif
20#ifdef CONFIG_PCI_BIOS 21#ifdef CONFIG_PCI_BIOS
21 pci_pcbios_init(); 22 pci_pcbios_init();
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 0908fca901bf..ca8df9c260bc 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -621,6 +621,13 @@ static __init int via_router_probe(struct irq_router *r,
621 */ 621 */
622 device = PCI_DEVICE_ID_VIA_8235; 622 device = PCI_DEVICE_ID_VIA_8235;
623 break; 623 break;
624 case PCI_DEVICE_ID_VIA_8237:
625 /**
626 * Asus a7v600 bios wrongly reports 8237
627 * as 586-compatible
628 */
629 device = PCI_DEVICE_ID_VIA_8237;
630 break;
624 } 631 }
625 } 632 }
626 633
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index 5e7636558c02..e11e9e803d5f 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -302,12 +302,13 @@ static struct pci_raw_ops pci_olpc_conf = {
302 .write = pci_olpc_write, 302 .write = pci_olpc_write,
303}; 303};
304 304
305void __init pci_olpc_init(void) 305int __init pci_olpc_init(void)
306{ 306{
307 if (!machine_is_olpc() || olpc_has_vsa()) 307 if (!machine_is_olpc() || olpc_has_vsa())
308 return; 308 return -ENODEV;
309 309
310 printk(KERN_INFO "PCI: Using configuration type OLPC\n"); 310 printk(KERN_INFO "PCI: Using configuration type OLPC\n");
311 raw_pci_ops = &pci_olpc_conf; 311 raw_pci_ops = &pci_olpc_conf;
312 is_lx = is_geode_lx(); 312 is_lx = is_geode_lx();
313 return 0;
313} 314}
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index f3972b12c60a..720c4c554534 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -101,7 +101,7 @@ extern struct pci_raw_ops pci_direct_conf1;
101extern int pci_direct_probe(void); 101extern int pci_direct_probe(void);
102extern void pci_direct_init(int type); 102extern void pci_direct_init(int type);
103extern void pci_pcbios_init(void); 103extern void pci_pcbios_init(void);
104extern void pci_olpc_init(void); 104extern int pci_olpc_init(void);
105 105
106/* pci-mmconfig.c */ 106/* pci-mmconfig.c */
107 107
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 23476c2ebfc4..efa2ba7c6005 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -106,9 +106,9 @@ int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
106 do_realtime((struct timespec *)tv); 106 do_realtime((struct timespec *)tv);
107 tv->tv_usec /= 1000; 107 tv->tv_usec /= 1000;
108 if (unlikely(tz != NULL)) { 108 if (unlikely(tz != NULL)) {
109 /* This relies on gcc inlining the memcpy. We'll notice 109 /* Avoid memcpy. Some old compilers fail to inline it */
110 if it ever fails to do so. */ 110 tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest;
111 memcpy(tz, &gtod->sys_tz, sizeof(struct timezone)); 111 tz->tz_dsttime = gtod->sys_tz.tz_dsttime;
112 } 112 }
113 return 0; 113 return 0;
114 } 114 }
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 2e641be2737e..6c388e593bc8 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,8 +5,9 @@
5config XEN 5config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK
8 depends on X86_32 9 depends on X86_32
9 depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
10 help 11 help
11 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
12 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c8a56e457d61..f09c1c69c37a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
785static __init void xen_pagetable_setup_start(pgd_t *base) 785static __init void xen_pagetable_setup_start(pgd_t *base)
786{ 786{
787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; 787 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
788 int i;
788 789
789 /* special set_pte for pagetable initialization */ 790 /* special set_pte for pagetable initialization */
790 pv_mmu_ops.set_pte = xen_set_pte_init; 791 pv_mmu_ops.set_pte = xen_set_pte_init;
791 792
792 init_mm.pgd = base; 793 init_mm.pgd = base;
793 /* 794 /*
794 * copy top-level of Xen-supplied pagetable into place. For 795 * copy top-level of Xen-supplied pagetable into place. This
795 * !PAE we can use this as-is, but for PAE it is a stand-in 796 * is a stand-in while we copy the pmd pages.
796 * while we copy the pmd pages.
797 */ 797 */
798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); 798 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
799 799
800 if (PTRS_PER_PMD > 1) { 800 /*
801 int i; 801 * For PAE, need to allocate new pmds, rather than
802 /* 802 * share Xen's, since Xen doesn't like pmd's being
803 * For PAE, need to allocate new pmds, rather than 803 * shared between address spaces.
804 * share Xen's, since Xen doesn't like pmd's being 804 */
805 * shared between address spaces. 805 for (i = 0; i < PTRS_PER_PGD; i++) {
806 */ 806 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
807 for (i = 0; i < PTRS_PER_PGD; i++) { 807 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
808 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
809 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
810 808
811 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), 809 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
812 PAGE_SIZE); 810 PAGE_SIZE);
813 811
814 make_lowmem_page_readonly(pmd); 812 make_lowmem_page_readonly(pmd);
815 813
816 set_pgd(&base[i], __pgd(1 + __pa(pmd))); 814 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
817 } else 815 } else
818 pgd_clear(&base[i]); 816 pgd_clear(&base[i]);
819 }
820 } 817 }
821 818
822 /* make sure zero_page is mapped RO so we can use it in pagetables */ 819 /* make sure zero_page is mapped RO so we can use it in pagetables */
@@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
873 870
874 /* Actually pin the pagetable down, but we can't set PG_pinned 871 /* Actually pin the pagetable down, but we can't set PG_pinned
875 yet because the page structures don't exist yet. */ 872 yet because the page structures don't exist yet. */
876 { 873 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
877 unsigned level;
878
879#ifdef CONFIG_X86_PAE
880 level = MMUEXT_PIN_L3_TABLE;
881#else
882 level = MMUEXT_PIN_L2_TABLE;
883#endif
884
885 pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
886 }
887} 874}
888 875
889/* This is called once we have the cpu_possible_map */ 876/* This is called once we have the cpu_possible_map */
@@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1093 .make_pte = xen_make_pte, 1080 .make_pte = xen_make_pte,
1094 .make_pgd = xen_make_pgd, 1081 .make_pgd = xen_make_pgd,
1095 1082
1096#ifdef CONFIG_X86_PAE
1097 .set_pte_atomic = xen_set_pte_atomic, 1083 .set_pte_atomic = xen_set_pte_atomic,
1098 .set_pte_present = xen_set_pte_at, 1084 .set_pte_present = xen_set_pte_at,
1099 .set_pud = xen_set_pud, 1085 .set_pud = xen_set_pud,
@@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1102 1088
1103 .make_pmd = xen_make_pmd, 1089 .make_pmd = xen_make_pmd,
1104 .pmd_val = xen_pmd_val, 1090 .pmd_val = xen_pmd_val,
1105#endif /* PAE */
1106 1091
1107 .activate_mm = xen_activate_mm, 1092 .activate_mm = xen_activate_mm,
1108 .dup_mmap = xen_dup_mmap, 1093 .dup_mmap = xen_dup_mmap,
@@ -1228,6 +1213,11 @@ asmlinkage void __init xen_start_kernel(void)
1228 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1213 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1229 pv_info.kernel_rpl = 0; 1214 pv_info.kernel_rpl = 0;
1230 1215
1216 /* Prevent unwanted bits from being set in PTEs. */
1217 __supported_pte_mask &= ~_PAGE_GLOBAL;
1218 if (!is_initial_xendomain())
1219 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1220
1231 /* set the limit of our address space */ 1221 /* set the limit of our address space */
1232 xen_reserve_top(); 1222 xen_reserve_top();
1233 1223
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 126766d43aea..4e527e7893a8 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -60,7 +60,7 @@ xmaddr_t arbitrary_virt_to_machine(unsigned long address)
60{ 60{
61 unsigned int level; 61 unsigned int level;
62 pte_t *pte = lookup_address(address, &level); 62 pte_t *pte = lookup_address(address, &level);
63 unsigned offset = address & PAGE_MASK; 63 unsigned offset = address & ~PAGE_MASK;
64 64
65 BUG_ON(pte == NULL); 65 BUG_ON(pte == NULL);
66 66
@@ -179,50 +179,56 @@ out:
179 preempt_enable(); 179 preempt_enable();
180} 180}
181 181
182pteval_t xen_pte_val(pte_t pte) 182/* Assume pteval_t is equivalent to all the other *val_t types. */
183static pteval_t pte_mfn_to_pfn(pteval_t val)
184{
185 if (val & _PAGE_PRESENT) {
186 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
187 pteval_t flags = val & ~PTE_MASK;
188 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
189 }
190
191 return val;
192}
193
194static pteval_t pte_pfn_to_mfn(pteval_t val)
183{ 195{
184 pteval_t ret = pte.pte; 196 if (val & _PAGE_PRESENT) {
197 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
198 pteval_t flags = val & ~PTE_MASK;
199 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
200 }
185 201
186 if (ret & _PAGE_PRESENT) 202 return val;
187 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; 203}
188 204
189 return ret; 205pteval_t xen_pte_val(pte_t pte)
206{
207 return pte_mfn_to_pfn(pte.pte);
190} 208}
191 209
192pgdval_t xen_pgd_val(pgd_t pgd) 210pgdval_t xen_pgd_val(pgd_t pgd)
193{ 211{
194 pgdval_t ret = pgd.pgd; 212 return pte_mfn_to_pfn(pgd.pgd);
195 if (ret & _PAGE_PRESENT)
196 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
197 return ret;
198} 213}
199 214
200pte_t xen_make_pte(pteval_t pte) 215pte_t xen_make_pte(pteval_t pte)
201{ 216{
202 if (pte & _PAGE_PRESENT) { 217 pte = pte_pfn_to_mfn(pte);
203 pte = phys_to_machine(XPADDR(pte)).maddr; 218 return native_make_pte(pte);
204 pte &= ~(_PAGE_PCD | _PAGE_PWT);
205 }
206
207 return (pte_t){ .pte = pte };
208} 219}
209 220
210pgd_t xen_make_pgd(pgdval_t pgd) 221pgd_t xen_make_pgd(pgdval_t pgd)
211{ 222{
212 if (pgd & _PAGE_PRESENT) 223 pgd = pte_pfn_to_mfn(pgd);
213 pgd = phys_to_machine(XPADDR(pgd)).maddr; 224 return native_make_pgd(pgd);
214
215 return (pgd_t){ pgd };
216} 225}
217 226
218pmdval_t xen_pmd_val(pmd_t pmd) 227pmdval_t xen_pmd_val(pmd_t pmd)
219{ 228{
220 pmdval_t ret = native_pmd_val(pmd); 229 return pte_mfn_to_pfn(pmd.pmd);
221 if (ret & _PAGE_PRESENT)
222 ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
223 return ret;
224} 230}
225#ifdef CONFIG_X86_PAE 231
226void xen_set_pud(pud_t *ptr, pud_t val) 232void xen_set_pud(pud_t *ptr, pud_t val)
227{ 233{
228 struct multicall_space mcs; 234 struct multicall_space mcs;
@@ -267,17 +273,9 @@ void xen_pmd_clear(pmd_t *pmdp)
267 273
268pmd_t xen_make_pmd(pmdval_t pmd) 274pmd_t xen_make_pmd(pmdval_t pmd)
269{ 275{
270 if (pmd & _PAGE_PRESENT) 276 pmd = pte_pfn_to_mfn(pmd);
271 pmd = phys_to_machine(XPADDR(pmd)).maddr;
272
273 return native_make_pmd(pmd); 277 return native_make_pmd(pmd);
274} 278}
275#else /* !PAE */
276void xen_set_pte(pte_t *ptep, pte_t pte)
277{
278 *ptep = pte;
279}
280#endif /* CONFIG_X86_PAE */
281 279
282/* 280/*
283 (Yet another) pagetable walker. This one is intended for pinning a 281 (Yet another) pagetable walker. This one is intended for pinning a
@@ -430,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level)
430 read-only, and can be pinned. */ 428 read-only, and can be pinned. */
431void xen_pgd_pin(pgd_t *pgd) 429void xen_pgd_pin(pgd_t *pgd)
432{ 430{
433 unsigned level;
434
435 xen_mc_batch(); 431 xen_mc_batch();
436 432
437 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 433 if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
@@ -441,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd)
441 xen_mc_batch(); 437 xen_mc_batch();
442 } 438 }
443 439
444#ifdef CONFIG_X86_PAE 440 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
445 level = MMUEXT_PIN_L3_TABLE;
446#else
447 level = MMUEXT_PIN_L2_TABLE;
448#endif
449
450 xen_do_pin(level, PFN_DOWN(__pa(pgd)));
451
452 xen_mc_issue(0); 441 xen_mc_issue(0);
453} 442}
454 443
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index b5e189b1519d..5fe961caffd4 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm);
37void xen_pgd_pin(pgd_t *pgd); 37void xen_pgd_pin(pgd_t *pgd);
38//void xen_pgd_unpin(pgd_t *pgd); 38//void xen_pgd_unpin(pgd_t *pgd);
39 39
40#ifdef CONFIG_X86_PAE 40pteval_t xen_pte_val(pte_t);
41unsigned long long xen_pte_val(pte_t); 41pmdval_t xen_pmd_val(pmd_t);
42unsigned long long xen_pmd_val(pmd_t); 42pgdval_t xen_pgd_val(pgd_t);
43unsigned long long xen_pgd_val(pgd_t);
44 43
45pte_t xen_make_pte(unsigned long long); 44pte_t xen_make_pte(pteval_t);
46pmd_t xen_make_pmd(unsigned long long); 45pmd_t xen_make_pmd(pmdval_t);
47pgd_t xen_make_pgd(unsigned long long); 46pgd_t xen_make_pgd(pgdval_t);
48 47
49void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 48void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
50 pte_t *ptep, pte_t pteval); 49 pte_t *ptep, pte_t pteval);
@@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val);
53void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
54void xen_pmd_clear(pmd_t *pmdp); 53void xen_pmd_clear(pmd_t *pmdp);
55 54
56
57#else
58unsigned long xen_pte_val(pte_t);
59unsigned long xen_pmd_val(pmd_t);
60unsigned long xen_pgd_val(pgd_t);
61
62pte_t xen_make_pte(unsigned long);
63pmd_t xen_make_pmd(unsigned long);
64pgd_t xen_make_pgd(unsigned long);
65#endif
66
67#endif /* _XEN_MMU_H */ 55#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c39e1a5aa241..41e217503c96 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -12,7 +12,9 @@
12#include <linux/clocksource.h> 12#include <linux/clocksource.h>
13#include <linux/clockchips.h> 13#include <linux/clockchips.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/math64.h>
15 16
17#include <asm/pvclock.h>
16#include <asm/xen/hypervisor.h> 18#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h> 19#include <asm/xen/hypercall.h>
18 20
@@ -30,17 +32,6 @@
30 32
31static cycle_t xen_clocksource_read(void); 33static cycle_t xen_clocksource_read(void);
32 34
33/* These are perodically updated in shared_info, and then copied here. */
34struct shadow_time_info {
35 u64 tsc_timestamp; /* TSC at last update of time vals. */
36 u64 system_timestamp; /* Time, in nanosecs, since boot. */
37 u32 tsc_to_nsec_mul;
38 int tsc_shift;
39 u32 version;
40};
41
42static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
43
44/* runstate info updated by Xen */ 35/* runstate info updated by Xen */
45static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
46 37
@@ -150,11 +141,7 @@ static void do_stolen_accounting(void)
150 if (stolen < 0) 141 if (stolen < 0)
151 stolen = 0; 142 stolen = 0;
152 143
153 ticks = 0; 144 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
154 while (stolen >= NS_PER_TICK) {
155 ticks++;
156 stolen -= NS_PER_TICK;
157 }
158 __get_cpu_var(residual_stolen) = stolen; 145 __get_cpu_var(residual_stolen) = stolen;
159 account_steal_time(NULL, ticks); 146 account_steal_time(NULL, ticks);
160 147
@@ -166,11 +153,7 @@ static void do_stolen_accounting(void)
166 if (blocked < 0) 153 if (blocked < 0)
167 blocked = 0; 154 blocked = 0;
168 155
169 ticks = 0; 156 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
170 while (blocked >= NS_PER_TICK) {
171 ticks++;
172 blocked -= NS_PER_TICK;
173 }
174 __get_cpu_var(residual_blocked) = blocked; 157 __get_cpu_var(residual_blocked) = blocked;
175 account_steal_time(idle_task(smp_processor_id()), ticks); 158 account_steal_time(idle_task(smp_processor_id()), ticks);
176} 159}
@@ -218,7 +201,7 @@ unsigned long long xen_sched_clock(void)
218unsigned long xen_cpu_khz(void) 201unsigned long xen_cpu_khz(void)
219{ 202{
220 u64 xen_khz = 1000000ULL << 32; 203 u64 xen_khz = 1000000ULL << 32;
221 const struct vcpu_time_info *info = 204 const struct pvclock_vcpu_time_info *info =
222 &HYPERVISOR_shared_info->vcpu_info[0].time; 205 &HYPERVISOR_shared_info->vcpu_info[0].time;
223 206
224 do_div(xen_khz, info->tsc_to_system_mul); 207 do_div(xen_khz, info->tsc_to_system_mul);
@@ -230,121 +213,26 @@ unsigned long xen_cpu_khz(void)
230 return xen_khz; 213 return xen_khz;
231} 214}
232 215
233/*
234 * Reads a consistent set of time-base values from Xen, into a shadow data
235 * area.
236 */
237static unsigned get_time_values_from_xen(void)
238{
239 struct vcpu_time_info *src;
240 struct shadow_time_info *dst;
241
242 /* src is shared memory with the hypervisor, so we need to
243 make sure we get a consistent snapshot, even in the face of
244 being preempted. */
245 src = &__get_cpu_var(xen_vcpu)->time;
246 dst = &__get_cpu_var(shadow_time);
247
248 do {
249 dst->version = src->version;
250 rmb(); /* fetch version before data */
251 dst->tsc_timestamp = src->tsc_timestamp;
252 dst->system_timestamp = src->system_time;
253 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
254 dst->tsc_shift = src->tsc_shift;
255 rmb(); /* test version after fetching data */
256 } while ((src->version & 1) | (dst->version ^ src->version));
257
258 return dst->version;
259}
260
261/*
262 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
263 * yielding a 64-bit result.
264 */
265static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
266{
267 u64 product;
268#ifdef __i386__
269 u32 tmp1, tmp2;
270#endif
271
272 if (shift < 0)
273 delta >>= -shift;
274 else
275 delta <<= shift;
276
277#ifdef __i386__
278 __asm__ (
279 "mul %5 ; "
280 "mov %4,%%eax ; "
281 "mov %%edx,%4 ; "
282 "mul %5 ; "
283 "xor %5,%5 ; "
284 "add %4,%%eax ; "
285 "adc %5,%%edx ; "
286 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
287 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
288#elif __x86_64__
289 __asm__ (
290 "mul %%rdx ; shrd $32,%%rdx,%%rax"
291 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
292#else
293#error implement me!
294#endif
295
296 return product;
297}
298
299static u64 get_nsec_offset(struct shadow_time_info *shadow)
300{
301 u64 now, delta;
302 now = native_read_tsc();
303 delta = now - shadow->tsc_timestamp;
304 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
305}
306
307static cycle_t xen_clocksource_read(void) 216static cycle_t xen_clocksource_read(void)
308{ 217{
309 struct shadow_time_info *shadow = &get_cpu_var(shadow_time); 218 struct pvclock_vcpu_time_info *src;
310 cycle_t ret; 219 cycle_t ret;
311 unsigned version;
312
313 do {
314 version = get_time_values_from_xen();
315 barrier();
316 ret = shadow->system_timestamp + get_nsec_offset(shadow);
317 barrier();
318 } while (version != __get_cpu_var(xen_vcpu)->time.version);
319
320 put_cpu_var(shadow_time);
321 220
221 src = &get_cpu_var(xen_vcpu)->time;
222 ret = pvclock_clocksource_read(src);
223 put_cpu_var(xen_vcpu);
322 return ret; 224 return ret;
323} 225}
324 226
325static void xen_read_wallclock(struct timespec *ts) 227static void xen_read_wallclock(struct timespec *ts)
326{ 228{
327 const struct shared_info *s = HYPERVISOR_shared_info; 229 struct shared_info *s = HYPERVISOR_shared_info;
328 u32 version; 230 struct pvclock_wall_clock *wall_clock = &(s->wc);
329 u64 delta; 231 struct pvclock_vcpu_time_info *vcpu_time;
330 struct timespec now;
331
332 /* get wallclock at system boot */
333 do {
334 version = s->wc_version;
335 rmb(); /* fetch version before time */
336 now.tv_sec = s->wc_sec;
337 now.tv_nsec = s->wc_nsec;
338 rmb(); /* fetch time before checking version */
339 } while ((s->wc_version & 1) | (version ^ s->wc_version));
340 232
341 delta = xen_clocksource_read(); /* time since system boot */ 233 vcpu_time = &get_cpu_var(xen_vcpu)->time;
342 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; 234 pvclock_read_wallclock(wall_clock, vcpu_time, ts);
343 235 put_cpu_var(xen_vcpu);
344 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
345 now.tv_sec = delta;
346
347 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
348} 236}
349 237
350unsigned long xen_get_wallclock(void) 238unsigned long xen_get_wallclock(void)
@@ -352,7 +240,6 @@ unsigned long xen_get_wallclock(void)
352 struct timespec ts; 240 struct timespec ts;
353 241
354 xen_read_wallclock(&ts); 242 xen_read_wallclock(&ts);
355
356 return ts.tv_sec; 243 return ts.tv_sec;
357} 244}
358 245
@@ -576,8 +463,6 @@ __init void xen_time_init(void)
576{ 463{
577 int cpu = smp_processor_id(); 464 int cpu = smp_processor_id();
578 465
579 get_time_values_from_xen();
580
581 clocksource_register(&xen_clocksource); 466 clocksource_register(&xen_clocksource);
582 467
583 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { 468 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 288d587ce73c..6ec3b4f7719b 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -17,7 +17,7 @@ ENTRY(startup_xen)
17 17
18 __FINIT 18 __FINIT
19 19
20.pushsection .bss.page_aligned 20.pushsection .text
21 .align PAGE_SIZE_asm 21 .align PAGE_SIZE_asm
22ENTRY(hypercall_page) 22ENTRY(hypercall_page)
23 .skip 0x1000 23 .skip 0x1000
@@ -30,11 +30,7 @@ ENTRY(hypercall_page)
30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) 30 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) 31 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") 32 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
33#ifdef CONFIG_X86_PAE
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") 33 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
35#else
36 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
37#endif
38 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") 34 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
39 35
40#endif /*CONFIG_XEN */ 36#endif /*CONFIG_XEN */