author     Jiri Kosina <jkosina@suse.cz>    2011-02-15 04:24:31 -0500
committer  Jiri Kosina <jkosina@suse.cz>    2011-02-15 04:24:31 -0500
commit     0a9d59a2461477bd9ed143c01af9df3f8f00fa81 (patch)
tree       df997d1cfb0786427a0df1fbd6f0640fa4248cf4 /arch/x86
parent     a23ce6da9677d245aa0aadc99f4197030350ab54 (diff)
parent     795abaf1e4e188c4171e3cd3dbb11a9fcacaf505 (diff)
Merge branch 'master' into for-next
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 31
-rw-r--r--  arch/x86/Kconfig.cpu | 2
-rw-r--r--  arch/x86/Kconfig.debug | 4
-rw-r--r--  arch/x86/include/asm/cacheflush.h | 42
-rw-r--r--  arch/x86/include/asm/cpu.h | 1
-rw-r--r--  arch/x86/include/asm/jump_label.h | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 1
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 5
-rw-r--r--  arch/x86/include/asm/numa_32.h | 2
-rw-r--r--  arch/x86/include/asm/numa_64.h | 1
-rw-r--r--  arch/x86/include/asm/paravirt.h | 24
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 6
-rw-r--r--  arch/x86/include/asm/percpu.h | 32
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 9
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 23
-rw-r--r--  arch/x86/include/asm/pgtable.h | 143
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 3
-rw-r--r--  arch/x86/include/asm/processor.h | 5
-rw-r--r--  arch/x86/include/asm/smp.h | 5
-rw-r--r--  arch/x86/include/asm/system_64.h | 22
-rw-r--r--  arch/x86/include/asm/xen/page.h | 16
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 1
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 15
-rw-r--r--  arch/x86/kernel/apb_timer.c | 14
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 10
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 12
-rw-r--r--  arch/x86/kernel/dumpstack.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 2
-rw-r--r--  arch/x86/kernel/e820.c | 1
-rw-r--r--  arch/x86/kernel/head_32.S | 30
-rw-r--r--  arch/x86/kernel/irq_32.c | 7
-rw-r--r--  arch/x86/kernel/module.c | 17
-rw-r--r--  arch/x86/kernel/paravirt.c | 3
-rw-r--r--  arch/x86/kernel/process.c | 33
-rw-r--r--  arch/x86/kernel/process_32.c | 4
-rw-r--r--  arch/x86/kernel/process_64.c | 6
-rw-r--r--  arch/x86/kernel/smpboot.c | 7
-rw-r--r--  arch/x86/kernel/tboot.c | 2
-rw-r--r--  arch/x86/kernel/tsc.c | 4
-rw-r--r--  arch/x86/kernel/vm86_32.c | 1
-rw-r--r--  arch/x86/kvm/mmu.c | 125
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 9
-rw-r--r--  arch/x86/kvm/svm.c | 2
-rw-r--r--  arch/x86/lguest/Kconfig | 1
-rw-r--r--  arch/x86/lguest/boot.c | 2
-rw-r--r--  arch/x86/mm/gup.c | 28
-rw-r--r--  arch/x86/mm/numa.c | 22
-rw-r--r--  arch/x86/mm/numa_64.c | 24
-rw-r--r--  arch/x86/mm/pageattr.c | 8
-rw-r--r--  arch/x86/mm/pgtable.c | 66
-rw-r--r--  arch/x86/mm/srat_32.c | 1
-rw-r--r--  arch/x86/pci/broadcom_bus.c | 11
-rw-r--r--  arch/x86/pci/common.c | 41
-rw-r--r--  arch/x86/pci/irq.c | 3
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1.c | 101
-rw-r--r--  arch/x86/xen/Makefile | 3
-rw-r--r--  arch/x86/xen/enlighten.c | 2
-rw-r--r--  arch/x86/xen/irq.c | 2
-rw-r--r--  arch/x86/xen/mmu.c | 366
-rw-r--r--  arch/x86/xen/p2m.c | 522
-rw-r--r--  arch/x86/xen/setup.c | 8
64 files changed, 1262 insertions(+), 666 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 36ed2e2c896..d5ed94d30aa 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -627,11 +627,11 @@ config APB_TIMER
627 as it is off-chip. APB timers are always running regardless of CPU 627 as it is off-chip. APB timers are always running regardless of CPU
628 C states, they are used as per CPU clockevent device when possible. 628 C states, they are used as per CPU clockevent device when possible.
629 629
630# Mark as embedded because too many people got it wrong. 630# Mark as expert because too many people got it wrong.
631# The code disables itself when not needed. 631# The code disables itself when not needed.
632config DMI 632config DMI
633 default y 633 default y
634 bool "Enable DMI scanning" if EMBEDDED 634 bool "Enable DMI scanning" if EXPERT
635 ---help--- 635 ---help---
636 Enabled scanning of DMI to identify machine quirks. Say Y 636 Enabled scanning of DMI to identify machine quirks. Say Y
637 here unless you have verified that your setup is not 637 here unless you have verified that your setup is not
@@ -639,7 +639,7 @@ config DMI
639 BIOS code. 639 BIOS code.
640 640
641config GART_IOMMU 641config GART_IOMMU
642 bool "GART IOMMU support" if EMBEDDED 642 bool "GART IOMMU support" if EXPERT
643 default y 643 default y
644 select SWIOTLB 644 select SWIOTLB
645 depends on X86_64 && PCI && AMD_NB 645 depends on X86_64 && PCI && AMD_NB
@@ -889,7 +889,7 @@ config X86_THERMAL_VECTOR
889 depends on X86_MCE_INTEL 889 depends on X86_MCE_INTEL
890 890
891config VM86 891config VM86
892 bool "Enable VM86 support" if EMBEDDED 892 bool "Enable VM86 support" if EXPERT
893 default y 893 default y
894 depends on X86_32 894 depends on X86_32
895 ---help--- 895 ---help---
@@ -1073,7 +1073,7 @@ endchoice
1073 1073
1074choice 1074choice
1075 depends on EXPERIMENTAL 1075 depends on EXPERIMENTAL
1076 prompt "Memory split" if EMBEDDED 1076 prompt "Memory split" if EXPERT
1077 default VMSPLIT_3G 1077 default VMSPLIT_3G
1078 depends on X86_32 1078 depends on X86_32
1079 ---help--- 1079 ---help---
@@ -1135,7 +1135,7 @@ config ARCH_DMA_ADDR_T_64BIT
1135 def_bool X86_64 || HIGHMEM64G 1135 def_bool X86_64 || HIGHMEM64G
1136 1136
1137config DIRECT_GBPAGES 1137config DIRECT_GBPAGES
1138 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED 1138 bool "Enable 1GB pages for kernel pagetables" if EXPERT
1139 default y 1139 default y
1140 depends on X86_64 1140 depends on X86_64
1141 ---help--- 1141 ---help---
@@ -1369,7 +1369,7 @@ config MATH_EMULATION
1369 1369
1370config MTRR 1370config MTRR
1371 def_bool y 1371 def_bool y
1372 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED 1372 prompt "MTRR (Memory Type Range Register) support" if EXPERT
1373 ---help--- 1373 ---help---
1374 On Intel P6 family processors (Pentium Pro, Pentium II and later) 1374 On Intel P6 family processors (Pentium Pro, Pentium II and later)
1375 the Memory Type Range Registers (MTRRs) may be used to control 1375 the Memory Type Range Registers (MTRRs) may be used to control
@@ -1435,7 +1435,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1435 1435
1436config X86_PAT 1436config X86_PAT
1437 def_bool y 1437 def_bool y
1438 prompt "x86 PAT support" if EMBEDDED 1438 prompt "x86 PAT support" if EXPERT
1439 depends on MTRR 1439 depends on MTRR
1440 ---help--- 1440 ---help---
1441 Use PAT attributes to setup page level cache control. 1441 Use PAT attributes to setup page level cache control.
@@ -1539,7 +1539,7 @@ config KEXEC_JUMP
1539 code in physical address mode via KEXEC 1539 code in physical address mode via KEXEC
1540 1540
1541config PHYSICAL_START 1541config PHYSICAL_START
1542 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1542 hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
1543 default "0x1000000" 1543 default "0x1000000"
1544 ---help--- 1544 ---help---
1545 This gives the physical address where the kernel is loaded. 1545 This gives the physical address where the kernel is loaded.
@@ -1934,13 +1934,19 @@ config PCI_MMCONFIG
1934 depends on X86_64 && PCI && ACPI 1934 depends on X86_64 && PCI && ACPI
1935 1935
1936config PCI_CNB20LE_QUIRK 1936config PCI_CNB20LE_QUIRK
1937 bool "Read CNB20LE Host Bridge Windows" 1937 bool "Read CNB20LE Host Bridge Windows" if EXPERT
1938 depends on PCI 1938 default n
1939 depends on PCI && EXPERIMENTAL
1939 help 1940 help
1940 Read the PCI windows out of the CNB20LE host bridge. This allows 1941 Read the PCI windows out of the CNB20LE host bridge. This allows
1941 PCI hotplug to work on systems with the CNB20LE chipset which do 1942 PCI hotplug to work on systems with the CNB20LE chipset which do
1942 not have ACPI. 1943 not have ACPI.
1943 1944
1945 There's no public spec for this chipset, and this functionality
1946 is known to be incomplete.
1947
1948 You should say N unless you know you need this.
1949
1944config DMAR 1950config DMAR
1945 bool "Support for DMA Remapping Devices (EXPERIMENTAL)" 1951 bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
1946 depends on PCI_MSI && ACPI && EXPERIMENTAL 1952 depends on PCI_MSI && ACPI && EXPERIMENTAL
@@ -2062,13 +2068,14 @@ config OLPC
2062 bool "One Laptop Per Child support" 2068 bool "One Laptop Per Child support"
2063 select GPIOLIB 2069 select GPIOLIB
2064 select OLPC_OPENFIRMWARE 2070 select OLPC_OPENFIRMWARE
2071 depends on !X86_64 && !X86_PAE
2065 ---help--- 2072 ---help---
2066 Add support for detecting the unique features of the OLPC 2073 Add support for detecting the unique features of the OLPC
2067 XO hardware. 2074 XO hardware.
2068 2075
2069config OLPC_XO1 2076config OLPC_XO1
2070 tristate "OLPC XO-1 support" 2077 tristate "OLPC XO-1 support"
2071 depends on OLPC && PCI 2078 depends on OLPC && MFD_CS5535
2072 ---help--- 2079 ---help---
2073 Add support for non-essential features of the OLPC XO-1 laptop. 2080 Add support for non-essential features of the OLPC XO-1 laptop.
2074 2081
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 15588a0ef46..283c5a6a03a 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -424,7 +424,7 @@ config X86_DEBUGCTLMSR
424 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML 424 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML
425 425
426menuconfig PROCESSOR_SELECT 426menuconfig PROCESSOR_SELECT
427 bool "Supported processor vendors" if EMBEDDED 427 bool "Supported processor vendors" if EXPERT
428 ---help--- 428 ---help---
429 This lets you choose what x86 vendor support code your kernel 429 This lets you choose what x86 vendor support code your kernel
430 will include. 430 will include.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 45143bbcfe5..615e18810f4 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -31,7 +31,7 @@ config X86_VERBOSE_BOOTUP
31 see errors. Disable this if you want silent bootup. 31 see errors. Disable this if you want silent bootup.
32 32
33config EARLY_PRINTK 33config EARLY_PRINTK
34 bool "Early printk" if EMBEDDED 34 bool "Early printk" if EXPERT
35 default y 35 default y
36 ---help--- 36 ---help---
37 Write kernel log output directly into the VGA buffer or to a serial 37 Write kernel log output directly into the VGA buffer or to a serial
@@ -138,7 +138,7 @@ config DEBUG_NX_TEST
138 138
139config DOUBLEFAULT 139config DOUBLEFAULT
140 default y 140 default y
141 bool "Enable doublefault exception handler" if EMBEDDED 141 bool "Enable doublefault exception handler" if EXPERT
142 depends on X86_32 142 depends on X86_32
143 ---help--- 143 ---help---
144 This option allows trapping of rare doublefault exceptions that 144 This option allows trapping of rare doublefault exceptions that
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 63e35ec9075..62f084478f7 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -1,48 +1,8 @@
1#ifndef _ASM_X86_CACHEFLUSH_H 1#ifndef _ASM_X86_CACHEFLUSH_H
2#define _ASM_X86_CACHEFLUSH_H 2#define _ASM_X86_CACHEFLUSH_H
3 3
4/* Keep includes the same across arches. */
5#include <linux/mm.h>
6
7/* Caches aren't brain-dead on the intel. */ 4/* Caches aren't brain-dead on the intel. */
8static inline void flush_cache_all(void) { } 5#include <asm-generic/cacheflush.h>
9static inline void flush_cache_mm(struct mm_struct *mm) { }
10static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
11static inline void flush_cache_range(struct vm_area_struct *vma,
12 unsigned long start, unsigned long end) { }
13static inline void flush_cache_page(struct vm_area_struct *vma,
14 unsigned long vmaddr, unsigned long pfn) { }
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
16static inline void flush_dcache_page(struct page *page) { }
17static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
18static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
19static inline void flush_icache_range(unsigned long start,
20 unsigned long end) { }
21static inline void flush_icache_page(struct vm_area_struct *vma,
22 struct page *page) { }
23static inline void flush_icache_user_range(struct vm_area_struct *vma,
24 struct page *page,
25 unsigned long addr,
26 unsigned long len) { }
27static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
28static inline void flush_cache_vunmap(unsigned long start,
29 unsigned long end) { }
30
31static inline void copy_to_user_page(struct vm_area_struct *vma,
32 struct page *page, unsigned long vaddr,
33 void *dst, const void *src,
34 unsigned long len)
35{
36 memcpy(dst, src, len);
37}
38
39static inline void copy_from_user_page(struct vm_area_struct *vma,
40 struct page *page, unsigned long vaddr,
41 void *dst, const void *src,
42 unsigned long len)
43{
44 memcpy(dst, src, len);
45}
46 6
47#ifdef CONFIG_X86_PAT 7#ifdef CONFIG_X86_PAT
48/* 8/*
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 4fab24de26b..6e6e7558e70 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,5 +32,6 @@ extern void arch_unregister_cpu(int);
32 32
33DECLARE_PER_CPU(int, cpu_state); 33DECLARE_PER_CPU(int, cpu_state);
34 34
35int __cpuinit mwait_usable(const struct cpuinfo_x86 *);
35 36
36#endif /* _ASM_X86_CPU_H */ 37#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index f52d42e8058..574dbc22893 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -14,7 +14,7 @@
14 do { \ 14 do { \
15 asm goto("1:" \ 15 asm goto("1:" \
16 JUMP_LABEL_INITIAL_NOP \ 16 JUMP_LABEL_INITIAL_NOP \
17 ".pushsection __jump_table, \"a\" \n\t"\ 17 ".pushsection __jump_table, \"aw\" \n\t"\
18 _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ 18 _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
19 ".popsection \n\t" \ 19 ".popsection \n\t" \
20 : : "i" (key) : : label); \ 20 : : "i" (key) : : label); \
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa75f21a9fb..ffd7f8d2918 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -822,6 +822,7 @@ extern bool kvm_rebooting;
822#define KVM_ARCH_WANT_MMU_NOTIFIER 822#define KVM_ARCH_WANT_MMU_NOTIFIER
823int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 823int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
824int kvm_age_hva(struct kvm *kvm, unsigned long hva); 824int kvm_age_hva(struct kvm *kvm, unsigned long hva);
825int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
825void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 826void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
826int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 827int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
827int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 828int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 4a2d4e0c18d..8b5393ec108 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -36,8 +36,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
36 unsigned cpu = smp_processor_id(); 36 unsigned cpu = smp_processor_id();
37 37
38 if (likely(prev != next)) { 38 if (likely(prev != next)) {
39 /* stop flush ipis for the previous mm */
40 cpumask_clear_cpu(cpu, mm_cpumask(prev));
41#ifdef CONFIG_SMP 39#ifdef CONFIG_SMP
42 percpu_write(cpu_tlbstate.state, TLBSTATE_OK); 40 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
43 percpu_write(cpu_tlbstate.active_mm, next); 41 percpu_write(cpu_tlbstate.active_mm, next);
@@ -47,6 +45,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
47 /* Re-load page tables */ 45 /* Re-load page tables */
48 load_cr3(next->pgd); 46 load_cr3(next->pgd);
49 47
48 /* stop flush ipis for the previous mm */
49 cpumask_clear_cpu(cpu, mm_cpumask(prev));
50
50 /* 51 /*
51 * load the LDT, if the LDT is different: 52 * load the LDT, if the LDT is different:
52 */ 53 */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index a37229011b5..b0ef2b449a9 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_NUMA_32_H 1#ifndef _ASM_X86_NUMA_32_H
2#define _ASM_X86_NUMA_32_H 2#define _ASM_X86_NUMA_32_H
3 3
4extern int numa_off;
5
4extern int pxm_to_nid(int pxm); 6extern int pxm_to_nid(int pxm);
5extern void numa_remove_cpu(int cpu); 7extern void numa_remove_cpu(int cpu);
6 8
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 5ae87285a50..0493be39607 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -40,6 +40,7 @@ extern void __cpuinit numa_remove_cpu(int cpu);
40#ifdef CONFIG_NUMA_EMU 40#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) 41#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) 42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
43void numa_emu_cmdline(char *);
43#endif /* CONFIG_NUMA_EMU */ 44#endif /* CONFIG_NUMA_EMU */
44#else 45#else
45static inline void init_cpu_to_node(void) { } 46static inline void init_cpu_to_node(void) { }
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 7709c12431b..ebbc4d8ab17 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
435{ 435{
436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); 436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
437} 437}
438static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
439 pmd_t *pmdp)
440{
441 PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
442}
438 443
439static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, 444static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
440 pte_t *ptep) 445 pte_t *ptep)
@@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
442 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); 447 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
443} 448}
444 449
450static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
451 pmd_t *pmdp)
452{
453 PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
454}
455
445static inline pte_t __pte(pteval_t val) 456static inline pte_t __pte(pteval_t val)
446{ 457{
447 pteval_t ret; 458 pteval_t ret;
@@ -543,6 +554,19 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
543 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); 554 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
544} 555}
545 556
557#ifdef CONFIG_TRANSPARENT_HUGEPAGE
558static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
559 pmd_t *pmdp, pmd_t pmd)
560{
561 if (sizeof(pmdval_t) > sizeof(long))
562 /* 5 arg words */
563 pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
564 else
565 PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp,
566 native_pmd_val(pmd));
567}
568#endif
569
546static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 570static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
547{ 571{
548 pmdval_t val = native_pmd_val(pmd); 572 pmdval_t val = native_pmd_val(pmd);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b82bac97525..82885099c86 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -265,10 +265,16 @@ struct pv_mmu_ops {
265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, 265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
266 pte_t *ptep, pte_t pteval); 266 pte_t *ptep, pte_t pteval);
267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); 267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
268 void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
269 pmd_t *pmdp, pmd_t pmdval);
268 void (*pte_update)(struct mm_struct *mm, unsigned long addr, 270 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
269 pte_t *ptep); 271 pte_t *ptep);
270 void (*pte_update_defer)(struct mm_struct *mm, 272 void (*pte_update_defer)(struct mm_struct *mm,
271 unsigned long addr, pte_t *ptep); 273 unsigned long addr, pte_t *ptep);
274 void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
275 pmd_t *pmdp);
276 void (*pmd_update_defer)(struct mm_struct *mm,
277 unsigned long addr, pmd_t *pmdp);
272 278
273 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, 279 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
274 pte_t *ptep); 280 pte_t *ptep);
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 8ee45167e81..7e172955ee5 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -273,34 +273,34 @@ do { \
273 typeof(var) pxo_new__ = (nval); \ 273 typeof(var) pxo_new__ = (nval); \
274 switch (sizeof(var)) { \ 274 switch (sizeof(var)) { \
275 case 1: \ 275 case 1: \
276 asm("\n1:mov "__percpu_arg(1)",%%al" \ 276 asm("\n\tmov "__percpu_arg(1)",%%al" \
277 "\n\tcmpxchgb %2, "__percpu_arg(1) \ 277 "\n1:\tcmpxchgb %2, "__percpu_arg(1) \
278 "\n\tjnz 1b" \ 278 "\n\tjnz 1b" \
279 : "=a" (pxo_ret__), "+m" (var) \ 279 : "=&a" (pxo_ret__), "+m" (var) \
280 : "q" (pxo_new__) \ 280 : "q" (pxo_new__) \
281 : "memory"); \ 281 : "memory"); \
282 break; \ 282 break; \
283 case 2: \ 283 case 2: \
284 asm("\n1:mov "__percpu_arg(1)",%%ax" \ 284 asm("\n\tmov "__percpu_arg(1)",%%ax" \
285 "\n\tcmpxchgw %2, "__percpu_arg(1) \ 285 "\n1:\tcmpxchgw %2, "__percpu_arg(1) \
286 "\n\tjnz 1b" \ 286 "\n\tjnz 1b" \
287 : "=a" (pxo_ret__), "+m" (var) \ 287 : "=&a" (pxo_ret__), "+m" (var) \
288 : "r" (pxo_new__) \ 288 : "r" (pxo_new__) \
289 : "memory"); \ 289 : "memory"); \
290 break; \ 290 break; \
291 case 4: \ 291 case 4: \
292 asm("\n1:mov "__percpu_arg(1)",%%eax" \ 292 asm("\n\tmov "__percpu_arg(1)",%%eax" \
293 "\n\tcmpxchgl %2, "__percpu_arg(1) \ 293 "\n1:\tcmpxchgl %2, "__percpu_arg(1) \
294 "\n\tjnz 1b" \ 294 "\n\tjnz 1b" \
295 : "=a" (pxo_ret__), "+m" (var) \ 295 : "=&a" (pxo_ret__), "+m" (var) \
296 : "r" (pxo_new__) \ 296 : "r" (pxo_new__) \
297 : "memory"); \ 297 : "memory"); \
298 break; \ 298 break; \
299 case 8: \ 299 case 8: \
300 asm("\n1:mov "__percpu_arg(1)",%%rax" \ 300 asm("\n\tmov "__percpu_arg(1)",%%rax" \
301 "\n\tcmpxchgq %2, "__percpu_arg(1) \ 301 "\n1:\tcmpxchgq %2, "__percpu_arg(1) \
302 "\n\tjnz 1b" \ 302 "\n\tjnz 1b" \
303 : "=a" (pxo_ret__), "+m" (var) \ 303 : "=&a" (pxo_ret__), "+m" (var) \
304 : "r" (pxo_new__) \ 304 : "r" (pxo_new__) \
305 : "memory"); \ 305 : "memory"); \
306 break; \ 306 break; \
@@ -414,8 +414,6 @@ do { \
414#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) 414#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
415#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) 415#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
416#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) 416#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
417#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
418#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
419 417
420#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) 418#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
421#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) 419#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
@@ -432,8 +430,6 @@ do { \
432#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) 430#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
433#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) 431#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
434#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) 432#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
435#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
436#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
437 433
438#ifndef CONFIG_M386 434#ifndef CONFIG_M386
439#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) 435#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
@@ -475,11 +471,15 @@ do { \
475#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 471#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
476#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 472#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
477#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) 473#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
474#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
475#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
478 476
479#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) 477#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
480#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 478#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
481#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 479#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
482#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 480#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
481#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
482#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
483#endif 483#endif
484 484
485/* This is not atomic against other CPUs -- CPU preemption needs to be off */ 485/* This is not atomic against other CPUs -- CPU preemption needs to be off */
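Note on the percpu_xchg_op() change above: the old asm looped back to the initial mov, and the plain "=a" constraint could let the compiler place the input in %eax before the asm had finished with it; the fix starts the retry loop at the cmpxchg (which already reloads the accumulator on failure) and marks the output earlyclobber ("=&a"). A minimal C sketch of the resulting retry loop, using GCC __atomic builtins as a stand-in for the inline asm (illustrative only, not the kernel macro):

static unsigned long percpu_xchg_sketch(unsigned long *slot, unsigned long nval)
{
	unsigned long old = *slot;	/* single initial load, outside the retry loop */

	/*
	 * On failure, compare_exchange writes the current value back into
	 * 'old' - just as cmpxchg refreshes %eax - so the loop never needs
	 * to reload the variable before retrying.
	 */
	while (!__atomic_compare_exchange_n(slot, &old, nval, 0,
					    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
		;
	return old;
}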
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982b339..98391db840c 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
47#endif 47#endif
48 48
49#ifdef CONFIG_SMP
50static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
51{
52 return __pmd(xchg((pmdval_t *)xp, 0));
53}
54#else
55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
56#endif
57
49/* 58/*
50 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, 59 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
51 * split up the 29 bits of offset into this range: 60 * split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea0..94b979d1b58 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
105#endif 105#endif
106 106
107#ifdef CONFIG_SMP
108union split_pmd {
109 struct {
110 u32 pmd_low;
111 u32 pmd_high;
112 };
113 pmd_t pmd;
114};
115static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
116{
117 union split_pmd res, *orig = (union split_pmd *)pmdp;
118
119 /* xchg acts as a barrier before setting of the high bits */
120 res.pmd_low = xchg(&orig->pmd_low, 0);
121 res.pmd_high = orig->pmd_high;
122 orig->pmd_high = 0;
123
124 return res.pmd;
125}
126#else
127#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
128#endif
129
107/* 130/*
108 * Bits 0, 6 and 7 are taken in the low part of the pte, 131 * Bits 0, 6 and 7 are taken in the low part of the pte,
109 * put the 32 bits of offset into the high part. 132 * put the 32 bits of offset into the high part.
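Note on native_pmdp_get_and_clear() above: on 32-bit PAE a pmd is 64 bits wide and cannot be read and cleared in a single plain access, so the hunk clears the low dword with xchg before touching the high dword. A small sketch of that sequence, with a GCC builtin standing in for the kernel's xchg() (the type and function names here are illustrative, not the kernel's):

struct pae_entry { volatile unsigned int lo, hi; };

static unsigned long long pae_get_and_clear_sketch(struct pae_entry *e)
{
	unsigned long long v;

	/*
	 * The locked xchg clears the low dword (which holds the present bit)
	 * first and acts as a barrier, so the high dword is only read and
	 * cleared after the entry has already been made non-present.
	 */
	v  = __sync_lock_test_and_set(&e->lo, 0);
	v |= (unsigned long long)e->hi << 32;
	e->hi = 0;
	return v;
}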
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ada823a13c7..18601c86fab 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
35#else /* !CONFIG_PARAVIRT */ 35#else /* !CONFIG_PARAVIRT */
36#define set_pte(ptep, pte) native_set_pte(ptep, pte) 36#define set_pte(ptep, pte) native_set_pte(ptep, pte)
37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
38#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
38 39
39#define set_pte_atomic(ptep, pte) \ 40#define set_pte_atomic(ptep, pte) \
40 native_set_pte_atomic(ptep, pte) 41 native_set_pte_atomic(ptep, pte)
@@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
59 60
60#define pte_update(mm, addr, ptep) do { } while (0) 61#define pte_update(mm, addr, ptep) do { } while (0)
61#define pte_update_defer(mm, addr, ptep) do { } while (0) 62#define pte_update_defer(mm, addr, ptep) do { } while (0)
63#define pmd_update(mm, addr, ptep) do { } while (0)
64#define pmd_update_defer(mm, addr, ptep) do { } while (0)
62 65
63#define pgd_val(x) native_pgd_val(x) 66#define pgd_val(x) native_pgd_val(x)
64#define __pgd(x) native_make_pgd(x) 67#define __pgd(x) native_make_pgd(x)
@@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte)
94 return pte_flags(pte) & _PAGE_ACCESSED; 97 return pte_flags(pte) & _PAGE_ACCESSED;
95} 98}
96 99
100static inline int pmd_young(pmd_t pmd)
101{
102 return pmd_flags(pmd) & _PAGE_ACCESSED;
103}
104
97static inline int pte_write(pte_t pte) 105static inline int pte_write(pte_t pte)
98{ 106{
99 return pte_flags(pte) & _PAGE_RW; 107 return pte_flags(pte) & _PAGE_RW;
@@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte)
142 (_PAGE_PSE | _PAGE_PRESENT); 150 (_PAGE_PSE | _PAGE_PRESENT);
143} 151}
144 152
153#ifdef CONFIG_TRANSPARENT_HUGEPAGE
154static inline int pmd_trans_splitting(pmd_t pmd)
155{
156 return pmd_val(pmd) & _PAGE_SPLITTING;
157}
158
159static inline int pmd_trans_huge(pmd_t pmd)
160{
161 return pmd_val(pmd) & _PAGE_PSE;
162}
163
164static inline int has_transparent_hugepage(void)
165{
166 return cpu_has_pse;
167}
168#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
169
145static inline pte_t pte_set_flags(pte_t pte, pteval_t set) 170static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
146{ 171{
147 pteval_t v = native_pte_val(pte); 172 pteval_t v = native_pte_val(pte);
@@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte)
216 return pte_set_flags(pte, _PAGE_SPECIAL); 241 return pte_set_flags(pte, _PAGE_SPECIAL);
217} 242}
218 243
244static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
245{
246 pmdval_t v = native_pmd_val(pmd);
247
248 return __pmd(v | set);
249}
250
251static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
252{
253 pmdval_t v = native_pmd_val(pmd);
254
255 return __pmd(v & ~clear);
256}
257
258static inline pmd_t pmd_mkold(pmd_t pmd)
259{
260 return pmd_clear_flags(pmd, _PAGE_ACCESSED);
261}
262
263static inline pmd_t pmd_wrprotect(pmd_t pmd)
264{
265 return pmd_clear_flags(pmd, _PAGE_RW);
266}
267
268static inline pmd_t pmd_mkdirty(pmd_t pmd)
269{
270 return pmd_set_flags(pmd, _PAGE_DIRTY);
271}
272
273static inline pmd_t pmd_mkhuge(pmd_t pmd)
274{
275 return pmd_set_flags(pmd, _PAGE_PSE);
276}
277
278static inline pmd_t pmd_mkyoung(pmd_t pmd)
279{
280 return pmd_set_flags(pmd, _PAGE_ACCESSED);
281}
282
283static inline pmd_t pmd_mkwrite(pmd_t pmd)
284{
285 return pmd_set_flags(pmd, _PAGE_RW);
286}
287
288static inline pmd_t pmd_mknotpresent(pmd_t pmd)
289{
290 return pmd_clear_flags(pmd, _PAGE_PRESENT);
291}
292
219/* 293/*
220 * Mask out unsupported bits in a present pgprot. Non-present pgprots 294 * Mask out unsupported bits in a present pgprot. Non-present pgprots
221 * can use those bits for other purposes, so leave them be. 295 * can use those bits for other purposes, so leave them be.
@@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
256 return __pte(val); 330 return __pte(val);
257} 331}
258 332
333static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
334{
335 pmdval_t val = pmd_val(pmd);
336
337 val &= _HPAGE_CHG_MASK;
338 val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
339
340 return __pmd(val);
341}
342
259/* mprotect needs to preserve PAT bits when updating vm_page_prot */ 343/* mprotect needs to preserve PAT bits when updating vm_page_prot */
260#define pgprot_modify pgprot_modify 344#define pgprot_modify pgprot_modify
261static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) 345static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
350 * Currently stuck as a macro due to indirect forward reference to 434 * Currently stuck as a macro due to indirect forward reference to
351 * linux/mmzone.h's __section_mem_map_addr() definition: 435 * linux/mmzone.h's __section_mem_map_addr() definition:
352 */ 436 */
353#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) 437#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
354 438
355/* 439/*
356 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] 440 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
524 return res; 608 return res;
525} 609}
526 610
611static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
612{
613 pmd_t res = *pmdp;
614
615 native_pmd_clear(pmdp);
616 return res;
617}
618
527static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, 619static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
528 pte_t *ptep , pte_t pte) 620 pte_t *ptep , pte_t pte)
529{ 621{
530 native_set_pte(ptep, pte); 622 native_set_pte(ptep, pte);
531} 623}
532 624
625static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
626 pmd_t *pmdp , pmd_t pmd)
627{
628 native_set_pmd(pmdp, pmd);
629}
630
533#ifndef CONFIG_PARAVIRT 631#ifndef CONFIG_PARAVIRT
534/* 632/*
535 * Rules for using pte_update - it must be called after any PTE update which 633 * Rules for using pte_update - it must be called after any PTE update which
@@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
607 705
608#define flush_tlb_fix_spurious_fault(vma, address) 706#define flush_tlb_fix_spurious_fault(vma, address)
609 707
708#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
709
710#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
711extern int pmdp_set_access_flags(struct vm_area_struct *vma,
712 unsigned long address, pmd_t *pmdp,
713 pmd_t entry, int dirty);
714
715#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
716extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
717 unsigned long addr, pmd_t *pmdp);
718
719#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
720extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
721 unsigned long address, pmd_t *pmdp);
722
723
724#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
725extern void pmdp_splitting_flush(struct vm_area_struct *vma,
726 unsigned long addr, pmd_t *pmdp);
727
728#define __HAVE_ARCH_PMD_WRITE
729static inline int pmd_write(pmd_t pmd)
730{
731 return pmd_flags(pmd) & _PAGE_RW;
732}
733
734#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
735static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
736 pmd_t *pmdp)
737{
738 pmd_t pmd = native_pmdp_get_and_clear(pmdp);
739 pmd_update(mm, addr, pmdp);
740 return pmd;
741}
742
743#define __HAVE_ARCH_PMDP_SET_WRPROTECT
744static inline void pmdp_set_wrprotect(struct mm_struct *mm,
745 unsigned long addr, pmd_t *pmdp)
746{
747 clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
748 pmd_update(mm, addr, pmdp);
749}
750
610/* 751/*
611 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 752 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
612 * 753 *
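Note on the pmd helper block added to pgtable.h above: the pmd_mk*() accessors deliberately mirror the existing pte helpers so transparent-hugepage code can build and adjust huge entries the same way it handles ptes. A hedged usage sketch composing a huge pmd from the helpers introduced in this hunk ('page' and 'prot' are assumed to come from the caller, e.g. a THP fault path; this is illustrative, not the kernel's actual fault handler):

static pmd_t make_huge_pmd_sketch(struct page *page, pgprot_t prot)
{
	pmd_t entry = mk_pmd(page, prot);	/* pfn_pmd(page_to_pfn(page), prot) */

	entry = pmd_mkhuge(entry);		/* set _PAGE_PSE: map as a huge page */
	entry = pmd_mkdirty(entry);		/* freshly written */
	entry = pmd_mkwrite(entry);		/* allow writes (_PAGE_RW) */
	return pmd_mkyoung(entry);		/* mark accessed (_PAGE_ACCESSED) */
}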
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f86da20347f..975f709e09a 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
59 native_set_pte(ptep, pte); 59 native_set_pte(ptep, pte);
60} 60}
61 61
62static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
63{
64 *pmdp = pmd;
65}
66
67static inline void native_pmd_clear(pmd_t *pmd)
68{
69 native_set_pmd(pmd, native_make_pmd(0));
70}
71
62static inline pte_t native_ptep_get_and_clear(pte_t *xp) 72static inline pte_t native_ptep_get_and_clear(pte_t *xp)
63{ 73{
64#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
72#endif 82#endif
73} 83}
74 84
75static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) 85static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
76{ 86{
77 *pmdp = pmd; 87#ifdef CONFIG_SMP
78} 88 return native_make_pmd(xchg(&xp->pmd, 0));
79 89#else
80static inline void native_pmd_clear(pmd_t *pmd) 90 /* native_local_pmdp_get_and_clear,
81{ 91 but duplicated because of cyclic dependency */
82 native_set_pmd(pmd, native_make_pmd(0)); 92 pmd_t ret = *xp;
93 native_pmd_clear(xp);
94 return ret;
95#endif
83} 96}
84 97
85static inline void native_set_pud(pud_t *pudp, pud_t pud) 98static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void);
168#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) 181#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
169 182
170#define __HAVE_ARCH_PTE_SAME 183#define __HAVE_ARCH_PTE_SAME
184
171#endif /* !__ASSEMBLY__ */ 185#endif /* !__ASSEMBLY__ */
172 186
173#endif /* _ASM_X86_PGTABLE_64_H */ 187#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a760be2..7db7723d1f3 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,6 +22,7 @@
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
25#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
25#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 26#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
26 27
27/* If _PAGE_BIT_PRESENT is clear, we use these: */ 28/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -45,6 +46,7 @@
45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
48#define __HAVE_ARCH_PTE_SPECIAL 50#define __HAVE_ARCH_PTE_SPECIAL
49 51
50#ifdef CONFIG_KMEMCHECK 52#ifdef CONFIG_KMEMCHECK
@@ -70,6 +72,7 @@
70/* Set of bits not changed in pte_modify */ 72/* Set of bits not changed in pte_modify */
71#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 73#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
72 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) 74 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
75#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
73 76
74#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) 77#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
75#define _PAGE_CACHE_WB (0) 78#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 53fd1d5a1fe..45636cefa18 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -761,10 +761,11 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c);
761extern void init_c1e_mask(void); 761extern void init_c1e_mask(void);
762 762
763extern unsigned long boot_option_idle_override; 763extern unsigned long boot_option_idle_override;
764extern unsigned long idle_halt;
765extern unsigned long idle_nomwait;
766extern bool c1e_detected; 764extern bool c1e_detected;
767 765
766enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
767 IDLE_POLL, IDLE_FORCE_MWAIT};
768
768extern void enable_sep_cpu(void); 769extern void enable_sep_cpu(void);
769extern int sysenter_setup(void); 770extern int sysenter_setup(void);
770 771
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4c2f63c7fc1..1f469513677 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -40,10 +40,7 @@ DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
40DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 40DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
41 41
42/* Static state in head.S used to set up a CPU */ 42/* Static state in head.S used to set up a CPU */
43extern struct { 43extern unsigned long stack_start; /* Initial stack pointer address */
44 void *sp;
45 unsigned short ss;
46} stack_start;
47 44
48struct smp_ops { 45struct smp_ops {
49 void (*smp_prepare_boot_cpu)(void); 46 void (*smp_prepare_boot_cpu)(void);
diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h
deleted file mode 100644
index 1159e091ad0..00000000000
--- a/arch/x86/include/asm/system_64.h
+++ /dev/null
@@ -1,22 +0,0 @@
1#ifndef _ASM_X86_SYSTEM_64_H
2#define _ASM_X86_SYSTEM_64_H
3
4#include <asm/segment.h>
5#include <asm/cmpxchg.h>
6
7
8static inline unsigned long read_cr8(void)
9{
10 unsigned long cr8;
11 asm volatile("movq %%cr8,%0" : "=r" (cr8));
12 return cr8;
13}
14
15static inline void write_cr8(unsigned long val)
16{
17 asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
18}
19
20#include <linux/irqflags.h>
21
22#endif /* _ASM_X86_SYSTEM_64_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 8760cc60a21..f25bdf238a3 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -42,6 +42,11 @@ extern unsigned int machine_to_phys_order;
42extern unsigned long get_phys_to_machine(unsigned long pfn); 42extern unsigned long get_phys_to_machine(unsigned long pfn);
43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 43extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
44 44
45extern int m2p_add_override(unsigned long mfn, struct page *page);
46extern int m2p_remove_override(struct page *page);
47extern struct page *m2p_find_override(unsigned long mfn);
48extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
49
45static inline unsigned long pfn_to_mfn(unsigned long pfn) 50static inline unsigned long pfn_to_mfn(unsigned long pfn)
46{ 51{
47 unsigned long mfn; 52 unsigned long mfn;
@@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
72 if (xen_feature(XENFEAT_auto_translated_physmap)) 77 if (xen_feature(XENFEAT_auto_translated_physmap))
73 return mfn; 78 return mfn;
74 79
75 if (unlikely((mfn >> machine_to_phys_order) != 0))
76 return ~0;
77
78 pfn = 0; 80 pfn = 0;
79 /* 81 /*
80 * The array access can fail (e.g., device space beyond end of RAM). 82 * The array access can fail (e.g., device space beyond end of RAM).
@@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
83 */ 85 */
84 __get_user(pfn, &machine_to_phys_mapping[mfn]); 86 __get_user(pfn, &machine_to_phys_mapping[mfn]);
85 87
88 /*
89 * If this appears to be a foreign mfn (because the pfn
90 * doesn't map back to the mfn), then check the local override
91 * table to see if there's a better pfn to use.
92 */
93 if (get_phys_to_machine(pfn) != mfn)
94 pfn = m2p_find_override_pfn(mfn, pfn);
95
86 return pfn; 96 return pfn;
87} 97}
88 98
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ec881c6bfee..b3a71137983 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -509,6 +509,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
509 509
510 return 0; 510 return 0;
511} 511}
512EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
512 513
513int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) 514int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
514{ 515{
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 69fd72aa559..68d1537b8c8 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -12,10 +12,8 @@
12#include <linux/cpumask.h> 12#include <linux/cpumask.h>
13#include <asm/segment.h> 13#include <asm/segment.h>
14#include <asm/desc.h> 14#include <asm/desc.h>
15
16#ifdef CONFIG_X86_32
17#include <asm/pgtable.h> 15#include <asm/pgtable.h>
18#endif 16#include <asm/cacheflush.h>
19 17
20#include "realmode/wakeup.h" 18#include "realmode/wakeup.h"
21#include "sleep.h" 19#include "sleep.h"
@@ -100,7 +98,7 @@ int acpi_save_state_mem(void)
100#else /* CONFIG_64BIT */ 98#else /* CONFIG_64BIT */
101 header->trampoline_segment = setup_trampoline() >> 4; 99 header->trampoline_segment = setup_trampoline() >> 4;
102#ifdef CONFIG_SMP 100#ifdef CONFIG_SMP
103 stack_start.sp = temp_stack + sizeof(temp_stack); 101 stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
104 early_gdt_descr.address = 102 early_gdt_descr.address =
105 (unsigned long)get_cpu_gdt_table(smp_processor_id()); 103 (unsigned long)get_cpu_gdt_table(smp_processor_id());
106 initial_gs = per_cpu_offset(smp_processor_id()); 104 initial_gs = per_cpu_offset(smp_processor_id());
@@ -149,6 +147,15 @@ void __init acpi_reserve_wakeup_memory(void)
149 memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); 147 memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
150} 148}
151 149
150int __init acpi_configure_wakeup_memory(void)
151{
152 if (acpi_realmode)
153 set_memory_x(acpi_realmode, WAKEUP_SIZE >> PAGE_SHIFT);
154
155 return 0;
156}
157arch_initcall(acpi_configure_wakeup_memory);
158
152 159
153static int __init acpi_sleep_setup(char *str) 160static int __init acpi_sleep_setup(char *str)
154{ 161{
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 7c9ab59653e..51ef31a89be 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -313,14 +313,16 @@ static void apbt_setup_irq(struct apbt_dev *adev)
313 if (adev->irq == 0) 313 if (adev->irq == 0)
314 return; 314 return;
315 315
316 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
317 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
318 /* APB timer irqs are set up as mp_irqs, timer is edge type */
319 __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
320
316 if (system_state == SYSTEM_BOOTING) { 321 if (system_state == SYSTEM_BOOTING) {
317 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
318 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
319 /* APB timer irqs are set up as mp_irqs, timer is edge type */
320 __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
321 if (request_irq(adev->irq, apbt_interrupt_handler, 322 if (request_irq(adev->irq, apbt_interrupt_handler,
322 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, 323 IRQF_TIMER | IRQF_DISABLED |
323 adev->name, adev)) { 324 IRQF_NOBALANCING,
325 adev->name, adev)) {
324 printk(KERN_ERR "Failed request IRQ for APBT%d\n", 326 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
325 adev->num); 327 adev->num);
326 } 328 }
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7283e98deaa..ec2c19a7b8e 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ 45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ 46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ 47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
48 { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
48 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ 49 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
49 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 50 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
50 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 51 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
66 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ 67 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
67 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ 68 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
68 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ 69 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
70 { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
69 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ 71 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
70 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ 72 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
71 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ 73 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
@@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
87 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 89 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
88 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ 90 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
89 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ 91 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
92 { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
90 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ 93 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
91 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ 94 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
92 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ 95 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e12246ff5aa..6f8c5e9da97 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -59,6 +59,7 @@ struct thermal_state {
59 59
60/* Callback to handle core threshold interrupts */ 60/* Callback to handle core threshold interrupts */
61int (*platform_thermal_notify)(__u64 msr_val); 61int (*platform_thermal_notify)(__u64 msr_val);
62EXPORT_SYMBOL(platform_thermal_notify);
62 63
63static DEFINE_PER_CPU(struct thermal_state, thermal_state); 64static DEFINE_PER_CPU(struct thermal_state, thermal_state);
64 65
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 01c0f3ee6cc..bebabec5b44 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -793,13 +793,21 @@ void set_mtrr_aps_delayed_init(void)
793} 793}
794 794
795/* 795/*
796 * MTRR initialization for all AP's 796 * Delayed MTRR initialization for all AP's
797 */ 797 */
798void mtrr_aps_init(void) 798void mtrr_aps_init(void)
799{ 799{
800 if (!use_intel()) 800 if (!use_intel())
801 return; 801 return;
802 802
803 /*
804 * Check if someone has requested the delay of AP MTRR initialization,
805 * by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
806 * then we are done.
807 */
808 if (!mtrr_aps_delayed_init)
809 return;
810
803 set_mtrr(~0U, 0, 0, 0); 811 set_mtrr(~0U, 0, 0, 0);
804 mtrr_aps_delayed_init = false; 812 mtrr_aps_delayed_init = false;
805} 813}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index e56b9bfbabd..f7a0993c1e7 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -682,7 +682,7 @@ static int p4_validate_raw_event(struct perf_event *event)
682 * if an event is shared accross the logical threads 682 * if an event is shared accross the logical threads
683 * the user needs special permissions to be able to use it 683 * the user needs special permissions to be able to use it
684 */ 684 */
685 if (p4_event_bind_map[v].shared) { 685 if (p4_ht_active() && p4_event_bind_map[v].shared) {
686 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 686 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
687 return -EACCES; 687 return -EACCES;
688 } 688 }
@@ -727,7 +727,8 @@ static int p4_hw_config(struct perf_event *event)
727 event->hw.config = p4_set_ht_bit(event->hw.config); 727 event->hw.config = p4_set_ht_bit(event->hw.config);
728 728
729 if (event->attr.type == PERF_TYPE_RAW) { 729 if (event->attr.type == PERF_TYPE_RAW) {
730 730 struct p4_event_bind *bind;
731 unsigned int esel;
731 /* 732 /*
732 * Clear bits we reserve to be managed by kernel itself 733 * Clear bits we reserve to be managed by kernel itself
733 * and never allowed from a user space 734 * and never allowed from a user space
@@ -743,6 +744,13 @@ static int p4_hw_config(struct perf_event *event)
743 * bits since we keep additional info here (for cache events and etc) 744 * bits since we keep additional info here (for cache events and etc)
744 */ 745 */
745 event->hw.config |= event->attr.config; 746 event->hw.config |= event->attr.config;
747 bind = p4_config_get_bind(event->attr.config);
748 if (!bind) {
749 rc = -EINVAL;
750 goto out;
751 }
752 esel = P4_OPCODE_ESEL(bind->opcode);
753 event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
746 } 754 }
747 755
748 rc = x86_setup_perfctr(event); 756 rc = x86_setup_perfctr(event);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index d6fb146c0d8..df20723a6a1 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -234,6 +234,7 @@ unsigned __kprobes long oops_begin(void)
234 bust_spinlocks(1); 234 bust_spinlocks(1);
235 return flags; 235 return flags;
236} 236}
237EXPORT_SYMBOL_GPL(oops_begin);
237 238
238void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 239void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
239{ 240{
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 64101335de1..a6b6fcf7f0a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -149,13 +149,13 @@ void dump_trace(struct task_struct *task,
149 unsigned used = 0; 149 unsigned used = 0;
150 struct thread_info *tinfo; 150 struct thread_info *tinfo;
151 int graph = 0; 151 int graph = 0;
152 unsigned long dummy;
152 unsigned long bp; 153 unsigned long bp;
153 154
154 if (!task) 155 if (!task)
155 task = current; 156 task = current;
156 157
157 if (!stack) { 158 if (!stack) {
158 unsigned long dummy;
159 stack = &dummy; 159 stack = &dummy;
160 if (task && task != current) 160 if (task && task != current)
161 stack = (unsigned long *)task->thread.sp; 161 stack = (unsigned long *)task->thread.sp;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0c2b7ef7a34..294f26da0c0 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -14,6 +14,7 @@
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/pfn.h> 15#include <linux/pfn.h>
16#include <linux/suspend.h> 16#include <linux/suspend.h>
17#include <linux/acpi.h>
17#include <linux/firmware-map.h> 18#include <linux/firmware-map.h>
18#include <linux/memblock.h> 19#include <linux/memblock.h>
19 20
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fc293dc8dc3..767d6c43de3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -85,6 +85,8 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
85 */ 85 */
86__HEAD 86__HEAD
87ENTRY(startup_32) 87ENTRY(startup_32)
88 movl pa(stack_start),%ecx
89
88 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 90 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
89 us to not reload segments */ 91 us to not reload segments */
90 testb $(1<<6), BP_loadflags(%esi) 92 testb $(1<<6), BP_loadflags(%esi)
@@ -99,7 +101,9 @@ ENTRY(startup_32)
99 movl %eax,%es 101 movl %eax,%es
100 movl %eax,%fs 102 movl %eax,%fs
101 movl %eax,%gs 103 movl %eax,%gs
104 movl %eax,%ss
1022: 1052:
106 leal -__PAGE_OFFSET(%ecx),%esp
103 107
104/* 108/*
105 * Clear BSS first so that there are no surprises... 109 * Clear BSS first so that there are no surprises...
@@ -145,8 +149,6 @@ ENTRY(startup_32)
145 * _brk_end is set up to point to the first "safe" location. 149 * _brk_end is set up to point to the first "safe" location.
146 * Mappings are created both at virtual address 0 (identity mapping) 150 * Mappings are created both at virtual address 0 (identity mapping)
147 * and PAGE_OFFSET for up to _end. 151 * and PAGE_OFFSET for up to _end.
148 *
149 * Note that the stack is not yet set up!
150 */ 152 */
151#ifdef CONFIG_X86_PAE 153#ifdef CONFIG_X86_PAE
152 154
@@ -282,6 +284,9 @@ ENTRY(startup_32_smp)
282 movl %eax,%es 284 movl %eax,%es
283 movl %eax,%fs 285 movl %eax,%fs
284 movl %eax,%gs 286 movl %eax,%gs
287 movl pa(stack_start),%ecx
288 movl %eax,%ss
289 leal -__PAGE_OFFSET(%ecx),%esp
285#endif /* CONFIG_SMP */ 290#endif /* CONFIG_SMP */
286default_entry: 291default_entry:
287 292
@@ -347,8 +352,8 @@ default_entry:
347 movl %eax,%cr0 /* ..and set paging (PG) bit */ 352 movl %eax,%cr0 /* ..and set paging (PG) bit */
348 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 353 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
3491: 3541:
350 /* Set up the stack pointer */ 355 /* Shift the stack pointer to a virtual address */
351 lss stack_start,%esp 356 addl $__PAGE_OFFSET, %esp
352 357
353/* 358/*
354 * Initialize eflags. Some BIOS's leave bits like NT set. This would 359 * Initialize eflags. Some BIOS's leave bits like NT set. This would
@@ -360,9 +365,7 @@ default_entry:
360 365
361#ifdef CONFIG_SMP 366#ifdef CONFIG_SMP
362 cmpb $0, ready 367 cmpb $0, ready
363 jz 1f /* Initial CPU cleans BSS */ 368 jnz checkCPUtype
364 jmp checkCPUtype
3651:
366#endif /* CONFIG_SMP */ 369#endif /* CONFIG_SMP */
367 370
368/* 371/*
@@ -470,14 +473,7 @@ is386: movl $2,%ecx # set MP
470 473
471 cld # gcc2 wants the direction flag cleared at all times 474 cld # gcc2 wants the direction flag cleared at all times
472 pushl $0 # fake return address for unwinder 475 pushl $0 # fake return address for unwinder
473#ifdef CONFIG_SMP
474 movb ready, %cl
475 movb $1, ready 476 movb $1, ready
476 cmpb $0,%cl # the first CPU calls start_kernel
477 je 1f
478 movl (stack_start), %esp
4791:
480#endif /* CONFIG_SMP */
481 jmp *(initial_code) 477 jmp *(initial_code)
482 478
483/* 479/*
@@ -670,15 +666,15 @@ ENTRY(initial_page_table)
670#endif 666#endif
671 667
672.data 668.data
669.balign 4
673ENTRY(stack_start) 670ENTRY(stack_start)
674 .long init_thread_union+THREAD_SIZE 671 .long init_thread_union+THREAD_SIZE
675 .long __BOOT_DS
676
677ready: .byte 0
678 672
679early_recursion_flag: 673early_recursion_flag:
680 .long 0 674 .long 0
681 675
676ready: .byte 0
677
682int_msg: 678int_msg:
683 .asciz "Unknown interrupt or fault at: %p %p %p\n" 679 .asciz "Unknown interrupt or fault at: %p %p %p\n"
684 680
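
The head_32.S changes above set up the boot stack before the BSS is cleared: %esp is first loaded with the physical address of the init stack (pa(stack_start)) and is only shifted back to its virtual address once paging is enabled. A minimal user-space sketch of that arithmetic, assuming the default 3G/1G split (__PAGE_OFFSET == 0xc0000000) and an invented link address for init_thread_union:

/* Sketch only: the init_thread_union address below is made up and
 * __PAGE_OFFSET is the default 32-bit value; both are assumptions. */
#include <stdio.h>

#define __PAGE_OFFSET	0xc0000000UL

int main(void)
{
	unsigned long stack_start = 0xc15d4000UL; /* init_thread_union + THREAD_SIZE (example) */
	unsigned long esp_phys = stack_start - __PAGE_OFFSET; /* leal -__PAGE_OFFSET(%ecx),%esp */
	unsigned long esp_virt = esp_phys + __PAGE_OFFSET;    /* addl $__PAGE_OFFSET, %esp */

	printf("boot stack: phys %#lx -> virt %#lx\n", esp_phys, esp_virt);
	return 0;
}
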
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 48ff6dcffa0..9974d21048f 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -129,8 +129,7 @@ void __cpuinit irq_ctx_init(int cpu)
129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
130 THREAD_FLAGS, 130 THREAD_FLAGS,
131 THREAD_ORDER)); 131 THREAD_ORDER));
132 irqctx->tinfo.task = NULL; 132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
133 irqctx->tinfo.exec_domain = NULL;
134 irqctx->tinfo.cpu = cpu; 133 irqctx->tinfo.cpu = cpu;
135 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
136 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 135 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
@@ -140,10 +139,8 @@ void __cpuinit irq_ctx_init(int cpu)
140 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 139 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
141 THREAD_FLAGS, 140 THREAD_FLAGS,
142 THREAD_ORDER)); 141 THREAD_ORDER));
143 irqctx->tinfo.task = NULL; 142 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
144 irqctx->tinfo.exec_domain = NULL;
145 irqctx->tinfo.cpu = cpu; 143 irqctx->tinfo.cpu = cpu;
146 irqctx->tinfo.preempt_count = 0;
147 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 144 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
148 145
149 per_cpu(softirq_ctx, cpu) = irqctx; 146 per_cpu(softirq_ctx, cpu) = irqctx;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 8f295609173..ab23f1ad4bf 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -37,20 +37,11 @@
37 37
38void *module_alloc(unsigned long size) 38void *module_alloc(unsigned long size)
39{ 39{
40 struct vm_struct *area; 40 if (PAGE_ALIGN(size) > MODULES_LEN)
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL; 41 return NULL;
47 42 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); 43 GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
49 if (!area) 44 -1, __builtin_return_address(0));
50 return NULL;
51
52 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
53 PAGE_KERNEL_EXEC);
54} 45}
55 46
56/* Free memory returned from module_alloc */ 47/* Free memory returned from module_alloc */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c5b250011fd..869e1aeeb71 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
421 .set_pte = native_set_pte, 421 .set_pte = native_set_pte,
422 .set_pte_at = native_set_pte_at, 422 .set_pte_at = native_set_pte_at,
423 .set_pmd = native_set_pmd, 423 .set_pmd = native_set_pmd,
424 .set_pmd_at = native_set_pmd_at,
424 .pte_update = paravirt_nop, 425 .pte_update = paravirt_nop,
425 .pte_update_defer = paravirt_nop, 426 .pte_update_defer = paravirt_nop,
427 .pmd_update = paravirt_nop,
428 .pmd_update_defer = paravirt_nop,
426 429
427 .ptep_modify_prot_start = __ptep_modify_prot_start, 430 .ptep_modify_prot_start = __ptep_modify_prot_start,
428 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 431 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 09c08a1c706..e764fc05d70 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h> 16#include <linux/hw_breakpoint.h>
17#include <asm/cpu.h>
17#include <asm/system.h> 18#include <asm/system.h>
18#include <asm/apic.h> 19#include <asm/apic.h>
19#include <asm/syscalls.h> 20#include <asm/syscalls.h>
@@ -22,11 +23,6 @@
22#include <asm/i387.h> 23#include <asm/i387.h>
23#include <asm/debugreg.h> 24#include <asm/debugreg.h>
24 25
25unsigned long idle_halt;
26EXPORT_SYMBOL(idle_halt);
27unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait);
29
30struct kmem_cache *task_xstate_cachep; 26struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep); 27EXPORT_SYMBOL_GPL(task_xstate_cachep);
32 28
@@ -327,7 +323,7 @@ long sys_execve(const char __user *name,
327/* 323/*
328 * Idle related variables and functions 324 * Idle related variables and functions
329 */ 325 */
330unsigned long boot_option_idle_override = 0; 326unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
331EXPORT_SYMBOL(boot_option_idle_override); 327EXPORT_SYMBOL(boot_option_idle_override);
332 328
333/* 329/*
@@ -386,6 +382,8 @@ void default_idle(void)
386 else 382 else
387 local_irq_enable(); 383 local_irq_enable();
388 current_thread_info()->status |= TS_POLLING; 384 current_thread_info()->status |= TS_POLLING;
385 trace_power_end(smp_processor_id());
386 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
389 } else { 387 } else {
390 local_irq_enable(); 388 local_irq_enable();
391 /* loop is done by the caller */ 389 /* loop is done by the caller */
@@ -443,8 +441,6 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
443 */ 441 */
444void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 442void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
445{ 443{
446 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
447 trace_cpu_idle((ax>>4)+1, smp_processor_id());
448 if (!need_resched()) { 444 if (!need_resched()) {
449 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) 445 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
450 clflush((void *)&current_thread_info()->flags); 446 clflush((void *)&current_thread_info()->flags);
@@ -471,6 +467,8 @@ static void mwait_idle(void)
471 __sti_mwait(0, 0); 467 __sti_mwait(0, 0);
472 else 468 else
473 local_irq_enable(); 469 local_irq_enable();
470 trace_power_end(smp_processor_id());
471 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
474 } else 472 } else
475 local_irq_enable(); 473 local_irq_enable();
476} 474}
@@ -503,17 +501,16 @@ static void poll_idle(void)
503 * 501 *
504 * idle=mwait overrides this decision and forces the usage of mwait. 502 * idle=mwait overrides this decision and forces the usage of mwait.
505 */ 503 */
506static int __cpuinitdata force_mwait;
507 504
508#define MWAIT_INFO 0x05 505#define MWAIT_INFO 0x05
509#define MWAIT_ECX_EXTENDED_INFO 0x01 506#define MWAIT_ECX_EXTENDED_INFO 0x01
510#define MWAIT_EDX_C1 0xf0 507#define MWAIT_EDX_C1 0xf0
511 508
512static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) 509int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
513{ 510{
514 u32 eax, ebx, ecx, edx; 511 u32 eax, ebx, ecx, edx;
515 512
516 if (force_mwait) 513 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
517 return 1; 514 return 1;
518 515
519 if (c->cpuid_level < MWAIT_INFO) 516 if (c->cpuid_level < MWAIT_INFO)
@@ -633,9 +630,10 @@ static int __init idle_setup(char *str)
633 if (!strcmp(str, "poll")) { 630 if (!strcmp(str, "poll")) {
634 printk("using polling idle threads.\n"); 631 printk("using polling idle threads.\n");
635 pm_idle = poll_idle; 632 pm_idle = poll_idle;
636 } else if (!strcmp(str, "mwait")) 633 boot_option_idle_override = IDLE_POLL;
637 force_mwait = 1; 634 } else if (!strcmp(str, "mwait")) {
638 else if (!strcmp(str, "halt")) { 635 boot_option_idle_override = IDLE_FORCE_MWAIT;
636 } else if (!strcmp(str, "halt")) {
639 /* 637 /*
640 * When the boot option of idle=halt is added, halt is 638 * When the boot option of idle=halt is added, halt is
641 * forced to be used for CPU idle. In such case CPU C2/C3 639 * forced to be used for CPU idle. In such case CPU C2/C3
@@ -644,8 +642,7 @@ static int __init idle_setup(char *str)
644 * the boot_option_idle_override. 642 * the boot_option_idle_override.
645 */ 643 */
646 pm_idle = default_idle; 644 pm_idle = default_idle;
647 idle_halt = 1; 645 boot_option_idle_override = IDLE_HALT;
648 return 0;
649 } else if (!strcmp(str, "nomwait")) { 646 } else if (!strcmp(str, "nomwait")) {
650 /* 647 /*
651 * If the boot option of "idle=nomwait" is added, 648 * If the boot option of "idle=nomwait" is added,
@@ -653,12 +650,10 @@ static int __init idle_setup(char *str)
653 * states. In such case it won't touch the variable 650 * states. In such case it won't touch the variable
654 * of boot_option_idle_override. 651 * of boot_option_idle_override.
655 */ 652 */
656 idle_nomwait = 1; 653 boot_option_idle_override = IDLE_NOMWAIT;
657 return 0;
658 } else 654 } else
659 return -1; 655 return -1;
660 656
661 boot_option_idle_override = 1;
662 return 0; 657 return 0;
663} 658}
664early_param("idle", idle_setup); 659early_param("idle", idle_setup);
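
The IDLE_* values used by idle_setup() and mwait_usable() above replace the separate idle_halt/idle_nomwait flags with a single override state. A sketch of the enum they refer to (the authoritative declaration is in <asm/processor.h>; the member order shown here is an assumption):

/* Assumed shape of the idle override states; see
 * arch/x86/include/asm/processor.h for the real definition. */
enum idle_boot_override { IDLE_NO_OVERRIDE = 0, IDLE_HALT, IDLE_NOMWAIT,
			  IDLE_POLL, IDLE_FORCE_MWAIT };
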
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4b9befa0e34..8d128783af4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,8 +57,6 @@
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/debugreg.h> 58#include <asm/debugreg.h>
59 59
60#include <trace/events/power.h>
61
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 61
64/* 62/*
@@ -113,8 +111,6 @@ void cpu_idle(void)
113 stop_critical_timings(); 111 stop_critical_timings();
114 pm_idle(); 112 pm_idle();
115 start_critical_timings(); 113 start_critical_timings();
116 trace_power_end(smp_processor_id());
117 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
118 } 114 }
119 tick_nohz_restart_sched_tick(); 115 tick_nohz_restart_sched_tick();
120 preempt_enable_no_resched(); 116 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4c818a73839..bd387e8f73b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,8 +51,6 @@
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/debugreg.h> 52#include <asm/debugreg.h>
53 53
54#include <trace/events/power.h>
55
56asmlinkage extern void ret_from_fork(void); 54asmlinkage extern void ret_from_fork(void);
57 55
58DEFINE_PER_CPU(unsigned long, old_rsp); 56DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -141,10 +139,6 @@ void cpu_idle(void)
141 pm_idle(); 139 pm_idle();
142 start_critical_timings(); 140 start_critical_timings();
143 141
144 trace_power_end(smp_processor_id());
145 trace_cpu_idle(PWR_EVENT_EXIT,
146 smp_processor_id());
147
148 /* In many cases the interrupt that ended idle 142 /* In many cases the interrupt that ended idle
149 has already called exit_idle. But some idle 143 has already called exit_idle. But some idle
150 loops can be woken up without interrupt. */ 144 loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 763df77343d..03273b6c272 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -638,7 +638,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
638 * target processor state. 638 * target processor state.
639 */ 639 */
640 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, 640 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
641 (unsigned long)stack_start.sp); 641 stack_start);
642 642
643 /* 643 /*
644 * Run STARTUP IPI loop. 644 * Run STARTUP IPI loop.
@@ -785,7 +785,7 @@ do_rest:
785#endif 785#endif
786 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 786 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
787 initial_code = (unsigned long)start_secondary; 787 initial_code = (unsigned long)start_secondary;
788 stack_start.sp = (void *) c_idle.idle->thread.sp; 788 stack_start = c_idle.idle->thread.sp;
789 789
790 /* start_ip had better be page-aligned! */ 790 /* start_ip had better be page-aligned! */
791 start_ip = setup_trampoline(); 791 start_ip = setup_trampoline();
@@ -1402,8 +1402,9 @@ static inline void mwait_play_dead(void)
1402 unsigned int highest_subcstate = 0; 1402 unsigned int highest_subcstate = 0;
1403 int i; 1403 int i;
1404 void *mwait_ptr; 1404 void *mwait_ptr;
1405 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
1405 1406
1406 if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_MWAIT)) 1407 if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)))
1407 return; 1408 return;
1408 if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) 1409 if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
1409 return; 1410 return;
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e..998e972f3b1 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
133 pmd = pmd_alloc(&tboot_mm, pud, vaddr); 133 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
134 if (!pmd) 134 if (!pmd)
135 return -1; 135 return -1;
136 pte = pte_alloc_map(&tboot_mm, pmd, vaddr); 136 pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
137 if (!pte) 137 if (!pte)
138 return -1; 138 return -1;
139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); 139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 823f79a17ad..ffe5755caa8 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -464,7 +464,7 @@ unsigned long native_calibrate_tsc(void)
464 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); 464 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
465 465
466 /* hpet or pmtimer available ? */ 466 /* hpet or pmtimer available ? */
467 if (!hpet && !ref1 && !ref2) 467 if (ref1 == ref2)
468 continue; 468 continue;
469 469
470 /* Check, whether the sampling was disturbed by an SMI */ 470 /* Check, whether the sampling was disturbed by an SMI */
@@ -935,7 +935,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
935 tsc_stop = tsc_read_refs(&ref_stop, hpet); 935 tsc_stop = tsc_read_refs(&ref_stop, hpet);
936 936
937 /* hpet or pmtimer available ? */ 937 /* hpet or pmtimer available ? */
938 if (!hpet && !ref_start && !ref_stop) 938 if (ref_start == ref_stop)
939 goto out; 939 goto out;
940 940
941 /* Check, whether the sampling was disturbed by an SMI */ 941 /* Check, whether the sampling was disturbed by an SMI */
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 61fb9851962..863f8753ab0 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
179 if (pud_none_or_clear_bad(pud)) 179 if (pud_none_or_clear_bad(pud))
180 goto out; 180 goto out;
181 pmd = pmd_offset(pud, 0xA0000); 181 pmd = pmd_offset(pud, 0xA0000);
182 split_huge_page_pmd(mm, pmd);
182 if (pmd_none_or_clear_bad(pmd)) 183 if (pmd_none_or_clear_bad(pmd))
183 goto out; 184 goto out;
184 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 185 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cafbb49981..f02b8edc3d4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
554 return ret; 554 return ret;
555} 555}
556 556
557static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 557static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
558{ 558{
559 struct kvm_memory_slot *slot; 559 struct kvm_memory_slot *slot;
560 int host_level, level, max_level;
561
562 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 560 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
563 if (slot && slot->dirty_bitmap) 561 if (slot && slot->dirty_bitmap)
564 return PT_PAGE_TABLE_LEVEL; 562 return true;
563 return false;
564}
565
566static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 int host_level, level, max_level;
565 569
566 host_level = host_mapping_level(vcpu->kvm, large_gfn); 570 host_level = host_mapping_level(vcpu->kvm, large_gfn);
567 571
@@ -941,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
941 return young; 945 return young;
942} 946}
943 947
948static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
949 unsigned long data)
950{
951 u64 *spte;
952 int young = 0;
953
954 /*
955 * If there's no access bit in the secondary pte set by the
956 * hardware it's up to gup-fast/gup to set the access bit in
957 * the primary pte or in the page structure.
958 */
959 if (!shadow_accessed_mask)
960 goto out;
961
962 spte = rmap_next(kvm, rmapp, NULL);
963 while (spte) {
964 u64 _spte = *spte;
965 BUG_ON(!(_spte & PT_PRESENT_MASK));
966 young = _spte & PT_ACCESSED_MASK;
967 if (young) {
968 young = 1;
969 break;
970 }
971 spte = rmap_next(kvm, rmapp, spte);
972 }
973out:
974 return young;
975}
976
944#define RMAP_RECYCLE_THRESHOLD 1000 977#define RMAP_RECYCLE_THRESHOLD 1000
945 978
946static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 979static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -961,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
961 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 994 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
962} 995}
963 996
997int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
998{
999 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1000}
1001
964#ifdef MMU_DEBUG 1002#ifdef MMU_DEBUG
965static int is_empty_shadow_page(u64 *spt) 1003static int is_empty_shadow_page(u64 *spt)
966{ 1004{
@@ -2281,6 +2319,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2281 return 1; 2319 return 1;
2282} 2320}
2283 2321
2322static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2323 gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2324{
2325 pfn_t pfn = *pfnp;
2326 gfn_t gfn = *gfnp;
2327 int level = *levelp;
2328
2329 /*
2330 * Check if it's a transparent hugepage. If this would be an
2331 * hugetlbfs page, level wouldn't be set to
2332 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2333 * here.
2334 */
2335 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2336 level == PT_PAGE_TABLE_LEVEL &&
2337 PageTransCompound(pfn_to_page(pfn)) &&
2338 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2339 unsigned long mask;
2340 /*
2341 * mmu_notifier_retry was successful and we hold the
2342 * mmu_lock here, so the pmd can't become splitting
2343 * from under us, and in turn
2344 * __split_huge_page_refcount() can't run from under
2345 * us and we can safely transfer the refcount from
2346 * PG_tail to PG_head as we switch the pfn to tail to
2347 * head.
2348 */
2349 *levelp = level = PT_DIRECTORY_LEVEL;
2350 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2351 VM_BUG_ON((gfn & mask) != (pfn & mask));
2352 if (pfn & mask) {
2353 gfn &= ~mask;
2354 *gfnp = gfn;
2355 kvm_release_pfn_clean(pfn);
2356 pfn &= ~mask;
2357 if (!get_page_unless_zero(pfn_to_page(pfn)))
2358 BUG();
2359 *pfnp = pfn;
2360 }
2361 }
2362}
2363
2284static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2364static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2285 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2365 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2286 2366
@@ -2289,20 +2369,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2289{ 2369{
2290 int r; 2370 int r;
2291 int level; 2371 int level;
2372 int force_pt_level;
2292 pfn_t pfn; 2373 pfn_t pfn;
2293 unsigned long mmu_seq; 2374 unsigned long mmu_seq;
2294 bool map_writable; 2375 bool map_writable;
2295 2376
2296 level = mapping_level(vcpu, gfn); 2377 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2297 2378 if (likely(!force_pt_level)) {
2298 /* 2379 level = mapping_level(vcpu, gfn);
2299 * This path builds a PAE pagetable - so we can map 2mb pages at 2380 /*
2300 * maximum. Therefore check if the level is larger than that. 2381 * This path builds a PAE pagetable - so we can map
2301 */ 2382 * 2mb pages at maximum. Therefore check if the level
2302 if (level > PT_DIRECTORY_LEVEL) 2383 * is larger than that.
2303 level = PT_DIRECTORY_LEVEL; 2384 */
2385 if (level > PT_DIRECTORY_LEVEL)
2386 level = PT_DIRECTORY_LEVEL;
2304 2387
2305 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2388 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2389 } else
2390 level = PT_PAGE_TABLE_LEVEL;
2306 2391
2307 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2392 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2308 smp_rmb(); 2393 smp_rmb();
@@ -2318,6 +2403,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2318 if (mmu_notifier_retry(vcpu, mmu_seq)) 2403 if (mmu_notifier_retry(vcpu, mmu_seq))
2319 goto out_unlock; 2404 goto out_unlock;
2320 kvm_mmu_free_some_pages(vcpu); 2405 kvm_mmu_free_some_pages(vcpu);
2406 if (likely(!force_pt_level))
2407 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2321 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, 2408 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2322 prefault); 2409 prefault);
2323 spin_unlock(&vcpu->kvm->mmu_lock); 2410 spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2655,6 +2742,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2655 pfn_t pfn; 2742 pfn_t pfn;
2656 int r; 2743 int r;
2657 int level; 2744 int level;
2745 int force_pt_level;
2658 gfn_t gfn = gpa >> PAGE_SHIFT; 2746 gfn_t gfn = gpa >> PAGE_SHIFT;
2659 unsigned long mmu_seq; 2747 unsigned long mmu_seq;
2660 int write = error_code & PFERR_WRITE_MASK; 2748 int write = error_code & PFERR_WRITE_MASK;
@@ -2667,9 +2755,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2667 if (r) 2755 if (r)
2668 return r; 2756 return r;
2669 2757
2670 level = mapping_level(vcpu, gfn); 2758 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2671 2759 if (likely(!force_pt_level)) {
2672 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2760 level = mapping_level(vcpu, gfn);
2761 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2762 } else
2763 level = PT_PAGE_TABLE_LEVEL;
2673 2764
2674 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2765 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2675 smp_rmb(); 2766 smp_rmb();
@@ -2684,6 +2775,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2684 if (mmu_notifier_retry(vcpu, mmu_seq)) 2775 if (mmu_notifier_retry(vcpu, mmu_seq))
2685 goto out_unlock; 2776 goto out_unlock;
2686 kvm_mmu_free_some_pages(vcpu); 2777 kvm_mmu_free_some_pages(vcpu);
2778 if (likely(!force_pt_level))
2779 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2687 r = __direct_map(vcpu, gpa, write, map_writable, 2780 r = __direct_map(vcpu, gpa, write, map_writable,
2688 level, gfn, pfn, prefault); 2781 level, gfn, pfn, prefault);
2689 spin_unlock(&vcpu->kvm->mmu_lock); 2782 spin_unlock(&vcpu->kvm->mmu_lock);
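
transparent_hugepage_adjust() above promotes a 4k mapping to a 2MB one only when the guest and host offsets inside the huge page already agree, then rounds both frame numbers down to the huge-page boundary. A standalone sketch of that mask arithmetic (KVM_PAGES_PER_HPAGE() at PT_DIRECTORY_LEVEL is 512 with 4k base pages; the frame numbers are illustrative):

/* Sketch of the gfn/pfn alignment done by transparent_hugepage_adjust();
 * the concrete frame numbers are made up for illustration. */
#include <stdio.h>

int main(void)
{
	unsigned long pages_per_hpage = 512;	/* 2MB huge page / 4KB base pages */
	unsigned long mask = pages_per_hpage - 1;
	unsigned long gfn = 0x12345, pfn = 0x98745;

	/* mirrors the VM_BUG_ON in the patch: offsets inside the huge page
	 * must already match before the promotion is legal */
	if ((gfn & mask) != (pfn & mask)) {
		printf("offsets disagree, keep the 4k mapping\n");
		return 0;
	}

	gfn &= ~mask;	/* head of the guest huge frame */
	pfn &= ~mask;	/* head page of the host THP */
	printf("map gfn %#lx -> pfn %#lx as one 2MB page\n", gfn, pfn);
	return 0;
}
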
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 53210f1e94c..6bccc24c418 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
550 int r; 550 int r;
551 pfn_t pfn; 551 pfn_t pfn;
552 int level = PT_PAGE_TABLE_LEVEL; 552 int level = PT_PAGE_TABLE_LEVEL;
553 int force_pt_level;
553 unsigned long mmu_seq; 554 unsigned long mmu_seq;
554 bool map_writable; 555 bool map_writable;
555 556
@@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
577 return 0; 578 return 0;
578 } 579 }
579 580
580 if (walker.level >= PT_DIRECTORY_LEVEL) { 581 if (walker.level >= PT_DIRECTORY_LEVEL)
582 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
583 else
584 force_pt_level = 1;
585 if (!force_pt_level) {
581 level = min(walker.level, mapping_level(vcpu, walker.gfn)); 586 level = min(walker.level, mapping_level(vcpu, walker.gfn));
582 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 587 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
583 } 588 }
@@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
599 604
600 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 605 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
601 kvm_mmu_free_some_pages(vcpu); 606 kvm_mmu_free_some_pages(vcpu);
607 if (!force_pt_level)
608 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
602 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 609 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
603 level, &write_pt, pfn, map_writable, prefault); 610 level, &write_pt, pfn, map_writable, prefault);
604 (void)sptep; 611 (void)sptep;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 25bd1bc5aad..54ce246a383 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1150,8 +1150,8 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1150 kvm_load_ldt(svm->host.ldt); 1150 kvm_load_ldt(svm->host.ldt);
1151#ifdef CONFIG_X86_64 1151#ifdef CONFIG_X86_64
1152 loadsegment(fs, svm->host.fs); 1152 loadsegment(fs, svm->host.fs);
1153 load_gs_index(svm->host.gs);
1154 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); 1153 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1154 load_gs_index(svm->host.gs);
1155#else 1155#else
1156 loadsegment(gs, svm->host.gs); 1156 loadsegment(gs, svm->host.gs);
1157#endif 1157#endif
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 38718041efc..6e121a2a49e 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,6 +2,7 @@ config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 select VIRTUALIZATION
5 select VIRTIO 6 select VIRTIO
6 select VIRTIO_RING 7 select VIRTIO_RING
7 select VIRTIO_CONSOLE 8 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4996cf5f73a..eba687f0cc0 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -824,7 +824,7 @@ static void __init lguest_init_IRQ(void)
824 824
825 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 825 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
826 /* Some systems map "vectors" to interrupts weirdly. Not us! */ 826 /* Some systems map "vectors" to interrupts weirdly. Not us! */
827 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 827 __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
828 if (i != SYSCALL_VECTOR) 828 if (i != SYSCALL_VECTOR)
829 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 829 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
830 } 830 }
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799..dbe34b93137 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/vmstat.h> 9#include <linux/vmstat.h>
10#include <linux/highmem.h> 10#include <linux/highmem.h>
11#include <linux/swap.h>
11 12
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
13 14
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 90 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte); 91 page = pte_page(pte);
91 get_page(page); 92 get_page(page);
93 SetPageReferenced(page);
92 pages[*nr] = page; 94 pages[*nr] = page;
93 (*nr)++; 95 (*nr)++;
94 96
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
103 VM_BUG_ON(page != compound_head(page)); 105 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0); 106 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count); 107 atomic_add(nr, &page->_count);
108 SetPageReferenced(page);
109}
110
111static inline void get_huge_page_tail(struct page *page)
112{
113 /*
114 * __split_huge_page_refcount() cannot run
115 * from under us.
116 */
117 VM_BUG_ON(atomic_read(&page->_count) < 0);
118 atomic_inc(&page->_count);
106} 119}
107 120
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 121static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
128 do { 141 do {
129 VM_BUG_ON(compound_head(page) != head); 142 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page; 143 pages[*nr] = page;
144 if (PageTail(page))
145 get_huge_page_tail(page);
131 (*nr)++; 146 (*nr)++;
132 page++; 147 page++;
133 refs++; 148 refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
148 pmd_t pmd = *pmdp; 163 pmd_t pmd = *pmdp;
149 164
150 next = pmd_addr_end(addr, end); 165 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd)) 166 /*
167 * The pmd_trans_splitting() check below explains why
168 * pmdp_splitting_flush has to flush the tlb, to stop
169 * this gup-fast code from running while we set the
170 * splitting bit in the pmd. Returning zero will take
171 * the slow path that will call wait_split_huge_page()
172 * if the pmd is still in splitting state. gup-fast
173 * can't because it has irq disabled and
174 * wait_split_huge_page() would never return as the
175 * tlb flush IPI wouldn't run.
176 */
177 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
152 return 0; 178 return 0;
153 if (unlikely(pmd_large(pmd))) { 179 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) 180 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c..ebf6d7887a3 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,6 +2,28 @@
2#include <linux/topology.h> 2#include <linux/topology.h>
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <asm/numa.h>
6#include <asm/acpi.h>
7
8int __initdata numa_off;
9
10static __init int numa_setup(char *opt)
11{
12 if (!opt)
13 return -EINVAL;
14 if (!strncmp(opt, "off", 3))
15 numa_off = 1;
16#ifdef CONFIG_NUMA_EMU
17 if (!strncmp(opt, "fake=", 5))
18 numa_emu_cmdline(opt + 5);
19#endif
20#ifdef CONFIG_ACPI_NUMA
21 if (!strncmp(opt, "noacpi", 6))
22 acpi_numa = -1;
23#endif
24 return 0;
25}
26early_param("numa", numa_setup);
5 27
6/* 28/*
7 * Which logical CPUs are on which nodes 29 * Which logical CPUs are on which nodes
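
The consolidated numa_setup() above keeps the existing command-line interface; the options it accepts, read straight from the strncmp() checks:

	numa=off        # disable NUMA discovery (numa_off = 1)
	numa=fake=<arg> # CONFIG_NUMA_EMU only: argument is handed to numa_emu_cmdline()
	numa=noacpi     # CONFIG_ACPI_NUMA only: ignore the ACPI SRAT (acpi_numa = -1)
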
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 1e72102e80c..95ea1551eeb 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -30,7 +30,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
30 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 30 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
31}; 31};
32 32
33int numa_off __initdata;
34static unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
35static unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
36 35
@@ -263,6 +262,11 @@ static struct bootnode nodes[MAX_NUMNODES] __initdata;
263static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; 262static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
264static char *cmdline __initdata; 263static char *cmdline __initdata;
265 264
265void __init numa_emu_cmdline(char *str)
266{
267 cmdline = str;
268}
269
266static int __init setup_physnodes(unsigned long start, unsigned long end, 270static int __init setup_physnodes(unsigned long start, unsigned long end,
267 int acpi, int amd) 271 int acpi, int amd)
268{ 272{
@@ -670,24 +674,6 @@ unsigned long __init numa_free_all_bootmem(void)
670 return pages; 674 return pages;
671} 675}
672 676
673static __init int numa_setup(char *opt)
674{
675 if (!opt)
676 return -EINVAL;
677 if (!strncmp(opt, "off", 3))
678 numa_off = 1;
679#ifdef CONFIG_NUMA_EMU
680 if (!strncmp(opt, "fake=", 5))
681 cmdline = opt + 5;
682#endif
683#ifdef CONFIG_ACPI_NUMA
684 if (!strncmp(opt, "noacpi", 6))
685 acpi_numa = -1;
686#endif
687 return 0;
688}
689early_param("numa", numa_setup);
690
691#ifdef CONFIG_NUMA 677#ifdef CONFIG_NUMA
692 678
693static __init int find_near_online_node(int node) 679static __init int find_near_online_node(int node)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8b830ca14ac..d343b3c81f3 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -256,7 +256,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
256 unsigned long pfn) 256 unsigned long pfn)
257{ 257{
258 pgprot_t forbidden = __pgprot(0); 258 pgprot_t forbidden = __pgprot(0);
259 pgprot_t required = __pgprot(0);
260 259
261 /* 260 /*
262 * The BIOS area between 640k and 1Mb needs to be executable for 261 * The BIOS area between 640k and 1Mb needs to be executable for
@@ -282,12 +281,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
282 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, 281 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
283 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 282 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
284 pgprot_val(forbidden) |= _PAGE_RW; 283 pgprot_val(forbidden) |= _PAGE_RW;
285 /*
286 * .data and .bss should always be writable.
287 */
288 if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
289 within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
290 pgprot_val(required) |= _PAGE_RW;
291 284
292#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) 285#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
293 /* 286 /*
@@ -327,7 +320,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
327#endif 320#endif
328 321
329 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 322 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
330 prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
331 323
332 return prot; 324 return prot;
333} 325}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc8..500242d3c96 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
320 return changed; 320 return changed;
321} 321}
322 322
323#ifdef CONFIG_TRANSPARENT_HUGEPAGE
324int pmdp_set_access_flags(struct vm_area_struct *vma,
325 unsigned long address, pmd_t *pmdp,
326 pmd_t entry, int dirty)
327{
328 int changed = !pmd_same(*pmdp, entry);
329
330 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
331
332 if (changed && dirty) {
333 *pmdp = entry;
334 pmd_update_defer(vma->vm_mm, address, pmdp);
335 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
336 }
337
338 return changed;
339}
340#endif
341
323int ptep_test_and_clear_young(struct vm_area_struct *vma, 342int ptep_test_and_clear_young(struct vm_area_struct *vma,
324 unsigned long addr, pte_t *ptep) 343 unsigned long addr, pte_t *ptep)
325{ 344{
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
335 return ret; 354 return ret;
336} 355}
337 356
357#ifdef CONFIG_TRANSPARENT_HUGEPAGE
358int pmdp_test_and_clear_young(struct vm_area_struct *vma,
359 unsigned long addr, pmd_t *pmdp)
360{
361 int ret = 0;
362
363 if (pmd_young(*pmdp))
364 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
365 (unsigned long *)pmdp);
366
367 if (ret)
368 pmd_update(vma->vm_mm, addr, pmdp);
369
370 return ret;
371}
372#endif
373
338int ptep_clear_flush_young(struct vm_area_struct *vma, 374int ptep_clear_flush_young(struct vm_area_struct *vma,
339 unsigned long address, pte_t *ptep) 375 unsigned long address, pte_t *ptep)
340{ 376{
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
347 return young; 383 return young;
348} 384}
349 385
386#ifdef CONFIG_TRANSPARENT_HUGEPAGE
387int pmdp_clear_flush_young(struct vm_area_struct *vma,
388 unsigned long address, pmd_t *pmdp)
389{
390 int young;
391
392 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
393
394 young = pmdp_test_and_clear_young(vma, address, pmdp);
395 if (young)
396 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
397
398 return young;
399}
400
401void pmdp_splitting_flush(struct vm_area_struct *vma,
402 unsigned long address, pmd_t *pmdp)
403{
404 int set;
405 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
406 set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
407 (unsigned long *)pmdp);
408 if (set) {
409 pmd_update(vma->vm_mm, address, pmdp);
410 /* need tlb flush only to serialize against gup-fast */
411 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
412 }
413}
414#endif
415
350/** 416/**
351 * reserve_top_address - reserves a hole in the top of kernel address space 417 * reserve_top_address - reserves a hole in the top of kernel address space
352 * @reserve - size of hole to reserve 418 * @reserve - size of hole to reserve
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index f16434568a5..ae96e7b8051 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -59,7 +59,6 @@ static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
59static int __initdata num_memory_chunks; /* total number of memory chunks */ 59static int __initdata num_memory_chunks; /* total number of memory chunks */
60static u8 __initdata apicid_to_pxm[MAX_APICID]; 60static u8 __initdata apicid_to_pxm[MAX_APICID];
61 61
62int numa_off __initdata;
63int acpi_numa __initdata; 62int acpi_numa __initdata;
64 63
65static __init void bad_srat(void) 64static __init void bad_srat(void)
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index 0846a5bbbfb..ab8269b0da2 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -9,6 +9,7 @@
9 * option) any later version. 9 * option) any later version.
10 */ 10 */
11 11
12#include <linux/acpi.h>
12#include <linux/delay.h> 13#include <linux/delay.h>
13#include <linux/dmi.h> 14#include <linux/dmi.h>
14#include <linux/pci.h> 15#include <linux/pci.h>
@@ -25,12 +26,14 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
25 u8 fbus, lbus; 26 u8 fbus, lbus;
26 int i; 27 int i;
27 28
29#ifdef CONFIG_ACPI
28 /* 30 /*
29 * The x86_pci_root_bus_res_quirks() function already refuses to use 31 * We should get host bridge information from ACPI unless the BIOS
30 * this information if ACPI _CRS was used. Therefore, we don't bother 32 * doesn't support it.
31 * checking if ACPI is enabled, and just generate the information
32 * for both the ACPI _CRS and no ACPI cases.
33 */ 33 */
34 if (acpi_os_get_root_pointer())
35 return;
36#endif
34 37
35 info = &pci_root_info[pci_root_num]; 38 info = &pci_root_info[pci_root_num];
36 pci_root_num++; 39 pci_root_num++;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index f7c8a399978..5fe75026ecc 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
22 22
23unsigned int pci_early_dump_regs; 23unsigned int pci_early_dump_regs;
24static int pci_bf_sort; 24static int pci_bf_sort;
25static int smbios_type_b1_flag;
25int pci_routeirq; 26int pci_routeirq;
26int noioapicquirk; 27int noioapicquirk;
27#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS 28#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -185,6 +186,39 @@ static int __devinit set_bf_sort(const struct dmi_system_id *d)
185 return 0; 186 return 0;
186} 187}
187 188
189static void __devinit read_dmi_type_b1(const struct dmi_header *dm,
190 void *private_data)
191{
192 u8 *d = (u8 *)dm + 4;
193
194 if (dm->type != 0xB1)
195 return;
196 switch (((*(u32 *)d) >> 9) & 0x03) {
197 case 0x00:
198 printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n");
199 break;
200 case 0x01: /* set pci=bfsort */
201 smbios_type_b1_flag = 1;
202 break;
203 case 0x02: /* do not set pci=bfsort */
204 smbios_type_b1_flag = 2;
205 break;
206 default:
207 break;
208 }
209}
210
211static int __devinit find_sort_method(const struct dmi_system_id *d)
212{
213 dmi_walk(read_dmi_type_b1, NULL);
214
215 if (smbios_type_b1_flag == 1) {
216 set_bf_sort(d);
217 return 0;
218 }
219 return -1;
220}
221
188/* 222/*
189 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) 223 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
190 */ 224 */
@@ -213,6 +247,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
213 }, 247 },
214#endif /* __i386__ */ 248#endif /* __i386__ */
215 { 249 {
250 .callback = find_sort_method,
251 .ident = "Dell System",
252 .matches = {
253 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
254 },
255 },
256 {
216 .callback = set_bf_sort, 257 .callback = set_bf_sort,
217 .ident = "Dell PowerEdge 1950", 258 .ident = "Dell PowerEdge 1950",
218 .matches = { 259 .matches = {
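
read_dmi_type_b1() above decodes Dell's OEM SMBIOS record 0xB1: the sort hint sits in bits 9-10 of the 32-bit field at offset 4 of the record. A standalone sketch of that decoding, using an invented record payload:

/* Sketch of the type 0xB1 flag extraction; the record bytes are made up. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	/* type, length, handle(2), then the 32-bit flag field */
	uint8_t record[8] = { 0xB1, 0x08, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00 };
	uint32_t flags;

	memcpy(&flags, record + 4, sizeof(flags));	/* d = (u8 *)dm + 4 */
	switch ((flags >> 9) & 0x03) {
	case 0x01:
		printf("set pci=bfsort\n");
		break;
	case 0x02:
		printf("do not set pci=bfsort\n");
		break;
	default:
		printf("unknown or unset flag\n");
		break;
	}
	return 0;
}
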
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9f9bfb705cf..87e6c832311 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -589,7 +589,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
589 case PCI_DEVICE_ID_INTEL_ICH10_1: 589 case PCI_DEVICE_ID_INTEL_ICH10_1:
590 case PCI_DEVICE_ID_INTEL_ICH10_2: 590 case PCI_DEVICE_ID_INTEL_ICH10_2:
591 case PCI_DEVICE_ID_INTEL_ICH10_3: 591 case PCI_DEVICE_ID_INTEL_ICH10_3:
592 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC: 592 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0:
593 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1:
593 r->name = "PIIX/ICH"; 594 r->name = "PIIX/ICH";
594 r->get = pirq_piix_get; 595 r->get = pirq_piix_get;
595 r->set = pirq_piix_set; 596 r->set = pirq_piix_set;
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index f5442c03abc..127775696d6 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * Support for features of the OLPC XO-1 laptop 2 * Support for features of the OLPC XO-1 laptop
3 * 3 *
4 * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
4 * Copyright (C) 2010 One Laptop per Child 5 * Copyright (C) 2010 One Laptop per Child
5 * Copyright (C) 2006 Red Hat, Inc. 6 * Copyright (C) 2006 Red Hat, Inc.
6 * Copyright (C) 2006 Advanced Micro Devices, Inc. 7 * Copyright (C) 2006 Advanced Micro Devices, Inc.
@@ -12,8 +13,6 @@
12 */ 13 */
13 14
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/pci.h>
16#include <linux/pci_ids.h>
17#include <linux/platform_device.h> 16#include <linux/platform_device.h>
18#include <linux/pm.h> 17#include <linux/pm.h>
19 18
@@ -22,9 +21,6 @@
22 21
23#define DRV_NAME "olpc-xo1" 22#define DRV_NAME "olpc-xo1"
24 23
25#define PMS_BAR 4
26#define ACPI_BAR 5
27
28/* PMC registers (PMS block) */ 24/* PMC registers (PMS block) */
29#define PM_SCLK 0x10 25#define PM_SCLK 0x10
30#define PM_IN_SLPCTL 0x20 26#define PM_IN_SLPCTL 0x20
@@ -57,65 +53,67 @@ static void xo1_power_off(void)
57 outl(0x00002000, acpi_base + PM1_CNT); 53 outl(0x00002000, acpi_base + PM1_CNT);
58} 54}
59 55
60/* Read the base addresses from the PCI BAR info */ 56static int __devinit olpc_xo1_probe(struct platform_device *pdev)
61static int __devinit setup_bases(struct pci_dev *pdev)
62{ 57{
63 int r; 58 struct resource *res;
64 59
65 r = pci_enable_device_io(pdev); 60 /* don't run on non-XOs */
66 if (r) { 61 if (!machine_is_olpc())
67 dev_err(&pdev->dev, "can't enable device IO\n"); 62 return -ENODEV;
68 return r;
69 }
70 63
71 r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); 64 res = platform_get_resource(pdev, IORESOURCE_IO, 0);
72 if (r) { 65 if (!res) {
73 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); 66 dev_err(&pdev->dev, "can't fetch device resource info\n");
74 return r; 67 return -EIO;
75 } 68 }
76 69
77 r = pci_request_region(pdev, PMS_BAR, DRV_NAME); 70 if (!request_region(res->start, resource_size(res), DRV_NAME)) {
78 if (r) { 71 dev_err(&pdev->dev, "can't request region\n");
79 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); 72 return -EIO;
80 pci_release_region(pdev, ACPI_BAR);
81 return r;
82 } 73 }
83 74
84 acpi_base = pci_resource_start(pdev, ACPI_BAR); 75 if (strcmp(pdev->name, "cs5535-pms") == 0)
85 pms_base = pci_resource_start(pdev, PMS_BAR); 76 pms_base = res->start;
77 else if (strcmp(pdev->name, "cs5535-acpi") == 0)
78 acpi_base = res->start;
79
80 /* If we have both addresses, we can override the poweroff hook */
81 if (pms_base && acpi_base) {
82 pm_power_off = xo1_power_off;
83 printk(KERN_INFO "OLPC XO-1 support registered\n");
84 }
86 85
87 return 0; 86 return 0;
88} 87}
89 88
90static int __devinit olpc_xo1_probe(struct platform_device *pdev) 89static int __devexit olpc_xo1_remove(struct platform_device *pdev)
91{ 90{
92 struct pci_dev *pcidev; 91 struct resource *r;
93 int r;
94
95 pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
96 NULL);
97 if (!pdev)
98 return -ENODEV;
99
100 r = setup_bases(pcidev);
101 if (r)
102 return r;
103 92
104 pm_power_off = xo1_power_off; 93 r = platform_get_resource(pdev, IORESOURCE_IO, 0);
94 release_region(r->start, resource_size(r));
105 95
106 printk(KERN_INFO "OLPC XO-1 support registered\n"); 96 if (strcmp(pdev->name, "cs5535-pms") == 0)
107 return 0; 97 pms_base = 0;
108} 98 else if (strcmp(pdev->name, "cs5535-acpi") == 0)
99 acpi_base = 0;
109 100
110static int __devexit olpc_xo1_remove(struct platform_device *pdev)
111{
112 pm_power_off = NULL; 101 pm_power_off = NULL;
113 return 0; 102 return 0;
114} 103}
115 104
116static struct platform_driver olpc_xo1_driver = { 105static struct platform_driver cs5535_pms_drv = {
106 .driver = {
107 .name = "cs5535-pms",
108 .owner = THIS_MODULE,
109 },
110 .probe = olpc_xo1_probe,
111 .remove = __devexit_p(olpc_xo1_remove),
112};
113
114static struct platform_driver cs5535_acpi_drv = {
117 .driver = { 115 .driver = {
118 .name = DRV_NAME, 116 .name = "cs5535-acpi",
119 .owner = THIS_MODULE, 117 .owner = THIS_MODULE,
120 }, 118 },
121 .probe = olpc_xo1_probe, 119 .probe = olpc_xo1_probe,
@@ -124,12 +122,23 @@ static struct platform_driver olpc_xo1_driver = {
124 122
125static int __init olpc_xo1_init(void) 123static int __init olpc_xo1_init(void)
126{ 124{
127 return platform_driver_register(&olpc_xo1_driver); 125 int r;
126
127 r = platform_driver_register(&cs5535_pms_drv);
128 if (r)
129 return r;
130
131 r = platform_driver_register(&cs5535_acpi_drv);
132 if (r)
133 platform_driver_unregister(&cs5535_pms_drv);
134
135 return r;
128} 136}
129 137
130static void __exit olpc_xo1_exit(void) 138static void __exit olpc_xo1_exit(void)
131{ 139{
132 platform_driver_unregister(&olpc_xo1_driver); 140 platform_driver_unregister(&cs5535_acpi_drv);
141 platform_driver_unregister(&cs5535_pms_drv);
133} 142}
134 143
135MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); 144MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 77938515891..17c565de3d6 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
12 12
13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
14 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
15 grant-table.o suspend.o platform-pci-unplug.o 15 grant-table.o suspend.o platform-pci-unplug.o \
16 p2m.o
16 17
17obj-$(CONFIG_SMP) += smp.o 18obj-$(CONFIG_SMP) += smp.o
18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 19obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7e8d3bc80af..50542efe45f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1194,7 +1194,7 @@ asmlinkage void __init xen_start_kernel(void)
1194 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1194 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1195 1195
1196 local_irq_disable(); 1196 local_irq_disable();
1197 early_boot_irqs_off(); 1197 early_boot_irqs_disabled = true;
1198 1198
1199 memblock_init(); 1199 memblock_init();
1200 1200
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 9d30105a0c4..6a6fe893964 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -126,7 +126,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
126#endif 126#endif
127}; 127};
128 128
129void __init xen_init_irq_ops() 129void __init xen_init_irq_ops(void)
130{ 130{
131 pv_irq_ops = xen_irq_ops; 131 pv_irq_ops = xen_irq_ops;
132 x86_init.irqs.intr_init = xen_init_IRQ; 132 x86_init.irqs.intr_init = xen_init_IRQ;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 44924e551fd..5e92b61ad57 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
173 */ 173 */
174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
175 175
176/*
177 * Xen leaves the responsibility for maintaining p2m mappings to the
178 * guests themselves, but it must also access and update the p2m array
179 * during suspend/resume when all the pages are reallocated.
180 *
181 * The p2m table is logically a flat array, but we implement it as a
182 * three-level tree to allow the address space to be sparse.
183 *
184 * Xen
185 * |
186 * p2m_top p2m_top_mfn
187 * / \ / \
188 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
189 * / \ / \ / /
190 * p2m p2m p2m p2m p2m p2m p2m ...
191 *
192 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
193 *
194 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
195 * maximum representable pseudo-physical address space is:
196 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
197 *
198 * P2M_PER_PAGE depends on the architecture, as a mfn is always
199 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
200 * 512 and 1024 entries respectively.
201 */
202
203unsigned long xen_max_p2m_pfn __read_mostly;
204
205#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
206#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
207#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
208
209#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
210
211/* Placeholders for holes in the address space */
212static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
213static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
214static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
215
216static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
217static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
218static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
219
220RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
221RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
222
223static inline unsigned p2m_top_index(unsigned long pfn)
224{
225 BUG_ON(pfn >= MAX_P2M_PFN);
226 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
227}
228
229static inline unsigned p2m_mid_index(unsigned long pfn)
230{
231 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
232}
233
234static inline unsigned p2m_index(unsigned long pfn)
235{
236 return pfn % P2M_PER_PAGE;
237}
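
The three index helpers above (part of the p2m code being moved wholesale into the new arch/x86/xen/p2m.c) split a pfn across the three p2m levels. A standalone sketch with the 64-bit constants (4096-byte pages and 8-byte unsigned long, so 512 entries at every level), showing both the split and the resulting limit on the pseudo-physical address space:

/* Sketch of the p2m indexing arithmetic; constants are the 64-bit values
 * implied by the #defines above. */
#include <stdio.h>

#define P2M_PER_PAGE		512UL	/* PAGE_SIZE / sizeof(unsigned long) */
#define P2M_MID_PER_PAGE	512UL
#define P2M_TOP_PER_PAGE	512UL

int main(void)
{
	unsigned long pfn = 0x123456;	/* arbitrary example pfn */
	unsigned long topidx = pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
	unsigned long mididx = (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
	unsigned long idx = pfn % P2M_PER_PAGE;

	/* 512 * 512 * 512 pages = 134217728 pfns, i.e. 512GB of guest space */
	printf("MAX_P2M_PFN = %lu\n",
	       P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE);
	printf("pfn %#lx -> top %lu, mid %lu, idx %lu\n",
	       pfn, topidx, mididx, idx);
	return 0;
}
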
238
239static void p2m_top_init(unsigned long ***top)
240{
241 unsigned i;
242
243 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
244 top[i] = p2m_mid_missing;
245}
246
247static void p2m_top_mfn_init(unsigned long *top)
248{
249 unsigned i;
250
251 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
252 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
253}
254
255static void p2m_top_mfn_p_init(unsigned long **top)
256{
257 unsigned i;
258
259 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
260 top[i] = p2m_mid_missing_mfn;
261}
262
263static void p2m_mid_init(unsigned long **mid)
264{
265 unsigned i;
266
267 for (i = 0; i < P2M_MID_PER_PAGE; i++)
268 mid[i] = p2m_missing;
269}
270
271static void p2m_mid_mfn_init(unsigned long *mid)
272{
273 unsigned i;
274
275 for (i = 0; i < P2M_MID_PER_PAGE; i++)
276 mid[i] = virt_to_mfn(p2m_missing);
277}
278
279static void p2m_init(unsigned long *p2m)
280{
281 unsigned i;
282
283 for (i = 0; i < P2M_MID_PER_PAGE; i++)
284 p2m[i] = INVALID_P2M_ENTRY;
285}
286
287/*
288 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
289 *
290 * This is called both at boot time, and after resuming from suspend:
291 * - At boot time we're called very early, and must use extend_brk()
292 * to allocate memory.
293 *
294 * - After resume we're called from within stop_machine, but the mfn
 295 * tree should already be completely allocated.
296 */
297void xen_build_mfn_list_list(void)
298{
299 unsigned long pfn;
300
301 /* Pre-initialize p2m_top_mfn to be completely missing */
302 if (p2m_top_mfn == NULL) {
303 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
304 p2m_mid_mfn_init(p2m_mid_missing_mfn);
305
306 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
307 p2m_top_mfn_p_init(p2m_top_mfn_p);
308
309 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
310 p2m_top_mfn_init(p2m_top_mfn);
311 } else {
312 /* Reinitialise, mfn's all change after migration */
313 p2m_mid_mfn_init(p2m_mid_missing_mfn);
314 }
315
316 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
317 unsigned topidx = p2m_top_index(pfn);
318 unsigned mididx = p2m_mid_index(pfn);
319 unsigned long **mid;
320 unsigned long *mid_mfn_p;
321
322 mid = p2m_top[topidx];
323 mid_mfn_p = p2m_top_mfn_p[topidx];
324
325 /* Don't bother allocating any mfn mid levels if
326 * they're just missing; simply update the stored mfn,
327 * since all could have changed over a migrate.
328 */
329 if (mid == p2m_mid_missing) {
330 BUG_ON(mididx);
331 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
332 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
333 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
334 continue;
335 }
336
337 if (mid_mfn_p == p2m_mid_missing_mfn) {
338 /*
339 * XXX boot-time only! We should never find
340 * missing parts of the mfn tree after
341 * boot. extend_brk() will BUG if we call
342 * it too late.
343 */
344 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
345 p2m_mid_mfn_init(mid_mfn_p);
346
347 p2m_top_mfn_p[topidx] = mid_mfn_p;
348 }
349
350 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
351 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
352 }
353}
354
355void xen_setup_mfn_list_list(void)
356{
357 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
358
359 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
360 virt_to_mfn(p2m_top_mfn);
361 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
362}
363
364/* Set up p2m_top to point to the domain-builder provided p2m pages */
365void __init xen_build_dynamic_phys_to_machine(void)
366{
367 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
368 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
369 unsigned long pfn;
370
371 xen_max_p2m_pfn = max_pfn;
372
373 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
374 p2m_init(p2m_missing);
375
376 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
377 p2m_mid_init(p2m_mid_missing);
378
379 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
380 p2m_top_init(p2m_top);
381
382 /*
383 * The domain builder gives us a pre-constructed p2m array in
384 * mfn_list for all the pages initially given to us, so we just
385 * need to graft that into our tree structure.
386 */
387 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
388 unsigned topidx = p2m_top_index(pfn);
389 unsigned mididx = p2m_mid_index(pfn);
390
391 if (p2m_top[topidx] == p2m_mid_missing) {
392 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
393 p2m_mid_init(mid);
394
395 p2m_top[topidx] = mid;
396 }
397
398 p2m_top[topidx][mididx] = &mfn_list[pfn];
399 }
400}
401
402unsigned long get_phys_to_machine(unsigned long pfn)
403{
404 unsigned topidx, mididx, idx;
405
406 if (unlikely(pfn >= MAX_P2M_PFN))
407 return INVALID_P2M_ENTRY;
408
409 topidx = p2m_top_index(pfn);
410 mididx = p2m_mid_index(pfn);
411 idx = p2m_index(pfn);
412
413 return p2m_top[topidx][mididx][idx];
414}
415EXPORT_SYMBOL_GPL(get_phys_to_machine);
416
417static void *alloc_p2m_page(void)
418{
419 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
420}
421
422static void free_p2m_page(void *p)
423{
424 free_page((unsigned long)p);
425}
426
427/*
428 * Fully allocate the p2m structure for a given pfn. We need to check
429 * that both the top and mid levels are allocated, and make sure the
430 * parallel mfn tree is kept in sync. We may race with other cpus, so
431 * the new pages are installed with cmpxchg; if we lose the race then
432 * simply free the page we allocated and use the one that's there.
433 */
434static bool alloc_p2m(unsigned long pfn)
435{
436 unsigned topidx, mididx;
437 unsigned long ***top_p, **mid;
438 unsigned long *top_mfn_p, *mid_mfn;
439
440 topidx = p2m_top_index(pfn);
441 mididx = p2m_mid_index(pfn);
442
443 top_p = &p2m_top[topidx];
444 mid = *top_p;
445
446 if (mid == p2m_mid_missing) {
447 /* Mid level is missing, allocate a new one */
448 mid = alloc_p2m_page();
449 if (!mid)
450 return false;
451
452 p2m_mid_init(mid);
453
454 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
455 free_p2m_page(mid);
456 }
457
458 top_mfn_p = &p2m_top_mfn[topidx];
459 mid_mfn = p2m_top_mfn_p[topidx];
460
461 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
462
463 if (mid_mfn == p2m_mid_missing_mfn) {
464 /* Separately check the mid mfn level */
465 unsigned long missing_mfn;
466 unsigned long mid_mfn_mfn;
467
468 mid_mfn = alloc_p2m_page();
469 if (!mid_mfn)
470 return false;
471
472 p2m_mid_mfn_init(mid_mfn);
473
474 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
475 mid_mfn_mfn = virt_to_mfn(mid_mfn);
476 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
477 free_p2m_page(mid_mfn);
478 else
479 p2m_top_mfn_p[topidx] = mid_mfn;
480 }
481
482 if (p2m_top[topidx][mididx] == p2m_missing) {
483 /* p2m leaf page is missing */
484 unsigned long *p2m;
485
486 p2m = alloc_p2m_page();
487 if (!p2m)
488 return false;
489
490 p2m_init(p2m);
491
492 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
493 free_p2m_page(p2m);
494 else
495 mid_mfn[mididx] = virt_to_mfn(p2m);
496 }
497
498 return true;
499}
500
501/* Try to install p2m mapping; fail if intermediate bits missing */
502bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
503{
504 unsigned topidx, mididx, idx;
505
506 if (unlikely(pfn >= MAX_P2M_PFN)) {
507 BUG_ON(mfn != INVALID_P2M_ENTRY);
508 return true;
509 }
510
511 topidx = p2m_top_index(pfn);
512 mididx = p2m_mid_index(pfn);
513 idx = p2m_index(pfn);
514
515 if (p2m_top[topidx][mididx] == p2m_missing)
516 return mfn == INVALID_P2M_ENTRY;
517
518 p2m_top[topidx][mididx][idx] = mfn;
519
520 return true;
521}
522
523bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
524{
525 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
526 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
527 return true;
528 }
529
530 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
531 if (!alloc_p2m(pfn))
532 return false;
533
534 if (!__set_phys_to_machine(pfn, mfn))
535 return false;
536 }
537
538 return true;
539}
540
541unsigned long arbitrary_virt_to_mfn(void *vaddr) 176unsigned long arbitrary_virt_to_mfn(void *vaddr)
542{ 177{
543 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); 178 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
@@ -566,6 +201,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
566 offset = address & ~PAGE_MASK; 201 offset = address & ~PAGE_MASK;
567 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); 202 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
568} 203}
204EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
569 205
570void make_lowmem_page_readonly(void *vaddr) 206void make_lowmem_page_readonly(void *vaddr)
571{ 207{
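The alloc_p2m() comment in the hunk above describes a lock-free install: allocate a candidate page, publish it into the tree slot with cmpxchg(), and if another CPU got there first, free the candidate and use the winner's page. The sketch below is a userspace illustration of that pattern only, assuming C11 atomics; install_level(), the slot layout, and the example sizes are made up for the example and are not the kernel's implementation.

#include <stdlib.h>
#include <stdatomic.h>

/*
 * Illustrative only: 'slot' stands in for a p2m_top/p2m_mid entry and
 * 'placeholder' for p2m_mid_missing / p2m_missing.
 */
static void *install_level(_Atomic(void *) *slot, void *placeholder, size_t size)
{
	void *cur = atomic_load(slot);

	if (cur != placeholder)
		return cur;			/* already populated by someone else */

	void *fresh = calloc(1, size);
	if (!fresh)
		return NULL;			/* mirrors alloc_p2m() returning false */

	/* Analogue of: cmpxchg(top_p, p2m_mid_missing, mid) */
	if (atomic_compare_exchange_strong(slot, &placeholder, fresh))
		return fresh;			/* we won: our page is now installed */

	free(fresh);				/* lost the race: discard our page... */
	return atomic_load(slot);		/* ...and use the one that is there */
}

int main(void)
{
	static _Atomic(void *) slot;
	static long missing;			/* stand-in for the shared "missing" page */

	atomic_store(&slot, &missing);
	void *page = install_level(&slot, &missing, 4096);
	if (page != &missing)
		free(page);
	return 0;
}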
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 00000000000..fd12d7ce7ff
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,522 @@
1/*
2 * Xen leaves the responsibility for maintaining p2m mappings to the
3 * guests themselves, but it must also access and update the p2m array
4 * during suspend/resume when all the pages are reallocated.
5 *
6 * The p2m table is logically a flat array, but we implement it as a
7 * three-level tree to allow the address space to be sparse.
8 *
9 * Xen
10 * |
11 * p2m_top p2m_top_mfn
12 * / \ / \
13 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
14 * / \ / \ / /
15 * p2m p2m p2m p2m p2m p2m p2m ...
16 *
17 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
18 *
19 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
20 * maximum representable pseudo-physical address space is:
21 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
22 *
23 * P2M_PER_PAGE depends on the architecture, as an mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32-bit), leading to
25 * 512 and 1024 entries respectively.
26 */
27
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/list.h>
31#include <linux/hash.h>
32#include <linux/sched.h>
33
34#include <asm/cache.h>
35#include <asm/setup.h>
36
37#include <asm/xen/page.h>
38#include <asm/xen/hypercall.h>
39#include <asm/xen/hypervisor.h>
40
41#include "xen-ops.h"
42
43static void __init m2p_override_init(void);
44
45unsigned long xen_max_p2m_pfn __read_mostly;
46
47#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
48#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
49#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
50
51#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
52
53/* Placeholders for holes in the address space */
54static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
55static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
56static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
57
58static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
59static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
60static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
61
62RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
63RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
64
65static inline unsigned p2m_top_index(unsigned long pfn)
66{
67 BUG_ON(pfn >= MAX_P2M_PFN);
68 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
69}
70
71static inline unsigned p2m_mid_index(unsigned long pfn)
72{
73 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
74}
75
76static inline unsigned p2m_index(unsigned long pfn)
77{
78 return pfn % P2M_PER_PAGE;
79}
80
81static void p2m_top_init(unsigned long ***top)
82{
83 unsigned i;
84
85 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
86 top[i] = p2m_mid_missing;
87}
88
89static void p2m_top_mfn_init(unsigned long *top)
90{
91 unsigned i;
92
93 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
94 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
95}
96
97static void p2m_top_mfn_p_init(unsigned long **top)
98{
99 unsigned i;
100
101 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
102 top[i] = p2m_mid_missing_mfn;
103}
104
105static void p2m_mid_init(unsigned long **mid)
106{
107 unsigned i;
108
109 for (i = 0; i < P2M_MID_PER_PAGE; i++)
110 mid[i] = p2m_missing;
111}
112
113static void p2m_mid_mfn_init(unsigned long *mid)
114{
115 unsigned i;
116
117 for (i = 0; i < P2M_MID_PER_PAGE; i++)
118 mid[i] = virt_to_mfn(p2m_missing);
119}
120
121static void p2m_init(unsigned long *p2m)
122{
123 unsigned i;
124
125 for (i = 0; i < P2M_MID_PER_PAGE; i++)
126 p2m[i] = INVALID_P2M_ENTRY;
127}
128
129/*
130 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
131 *
132 * This is called both at boot time, and after resuming from suspend:
133 * - At boot time we're called very early, and must use extend_brk()
134 * to allocate memory.
135 *
136 * - After resume we're called from within stop_machine, but the mfn
138 * tree should already be completely allocated.
138 */
139void xen_build_mfn_list_list(void)
140{
141 unsigned long pfn;
142
143 /* Pre-initialize p2m_top_mfn to be completely missing */
144 if (p2m_top_mfn == NULL) {
145 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
146 p2m_mid_mfn_init(p2m_mid_missing_mfn);
147
148 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
149 p2m_top_mfn_p_init(p2m_top_mfn_p);
150
151 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
152 p2m_top_mfn_init(p2m_top_mfn);
153 } else {
154 /* Reinitialise: mfns all change after migration */
155 p2m_mid_mfn_init(p2m_mid_missing_mfn);
156 }
157
158 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
159 unsigned topidx = p2m_top_index(pfn);
160 unsigned mididx = p2m_mid_index(pfn);
161 unsigned long **mid;
162 unsigned long *mid_mfn_p;
163
164 mid = p2m_top[topidx];
165 mid_mfn_p = p2m_top_mfn_p[topidx];
166
167 /* Don't bother allocating any mfn mid levels if
168 * they're just missing, just update the stored mfn,
169 * since all could have changed over a migrate.
170 */
171 if (mid == p2m_mid_missing) {
172 BUG_ON(mididx);
173 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
174 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
175 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
176 continue;
177 }
178
179 if (mid_mfn_p == p2m_mid_missing_mfn) {
180 /*
181 * XXX boot-time only! We should never find
182 * missing parts of the mfn tree after
183 * runtime. extend_brk() will BUG if we call
184 * it too late.
185 */
186 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
187 p2m_mid_mfn_init(mid_mfn_p);
188
189 p2m_top_mfn_p[topidx] = mid_mfn_p;
190 }
191
192 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
193 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
194 }
195}
196
197void xen_setup_mfn_list_list(void)
198{
199 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
200
201 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
202 virt_to_mfn(p2m_top_mfn);
203 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
204}
205
206/* Set up p2m_top to point to the domain-builder provided p2m pages */
207void __init xen_build_dynamic_phys_to_machine(void)
208{
209 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
210 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
211 unsigned long pfn;
212
213 xen_max_p2m_pfn = max_pfn;
214
215 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
216 p2m_init(p2m_missing);
217
218 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
219 p2m_mid_init(p2m_mid_missing);
220
221 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
222 p2m_top_init(p2m_top);
223
224 /*
225 * The domain builder gives us a pre-constructed p2m array in
226 * mfn_list for all the pages initially given to us, so we just
227 * need to graft that into our tree structure.
228 */
229 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
230 unsigned topidx = p2m_top_index(pfn);
231 unsigned mididx = p2m_mid_index(pfn);
232
233 if (p2m_top[topidx] == p2m_mid_missing) {
234 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
235 p2m_mid_init(mid);
236
237 p2m_top[topidx] = mid;
238 }
239
240 /*
241 * As long as the mfn_list has enough entries to completely
242 * fill a p2m page, pointing into the array is ok. But if
243 * not, the entries beyond the last pfn will be undefined.
244 */
245 if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
246 unsigned long p2midx;
247
248 p2midx = max_pfn % P2M_PER_PAGE;
249 for ( ; p2midx < P2M_PER_PAGE; p2midx++)
250 mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
251 }
252 p2m_top[topidx][mididx] = &mfn_list[pfn];
253 }
254
255 m2p_override_init();
256}
257
258unsigned long get_phys_to_machine(unsigned long pfn)
259{
260 unsigned topidx, mididx, idx;
261
262 if (unlikely(pfn >= MAX_P2M_PFN))
263 return INVALID_P2M_ENTRY;
264
265 topidx = p2m_top_index(pfn);
266 mididx = p2m_mid_index(pfn);
267 idx = p2m_index(pfn);
268
269 return p2m_top[topidx][mididx][idx];
270}
271EXPORT_SYMBOL_GPL(get_phys_to_machine);
272
273static void *alloc_p2m_page(void)
274{
275 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
276}
277
278static void free_p2m_page(void *p)
279{
280 free_page((unsigned long)p);
281}
282
283/*
284 * Fully allocate the p2m structure for a given pfn. We need to check
285 * that both the top and mid levels are allocated, and make sure the
286 * parallel mfn tree is kept in sync. We may race with other cpus, so
287 * the new pages are installed with cmpxchg; if we lose the race then
288 * simply free the page we allocated and use the one that's there.
289 */
290static bool alloc_p2m(unsigned long pfn)
291{
292 unsigned topidx, mididx;
293 unsigned long ***top_p, **mid;
294 unsigned long *top_mfn_p, *mid_mfn;
295
296 topidx = p2m_top_index(pfn);
297 mididx = p2m_mid_index(pfn);
298
299 top_p = &p2m_top[topidx];
300 mid = *top_p;
301
302 if (mid == p2m_mid_missing) {
303 /* Mid level is missing, allocate a new one */
304 mid = alloc_p2m_page();
305 if (!mid)
306 return false;
307
308 p2m_mid_init(mid);
309
310 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
311 free_p2m_page(mid);
312 }
313
314 top_mfn_p = &p2m_top_mfn[topidx];
315 mid_mfn = p2m_top_mfn_p[topidx];
316
317 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
318
319 if (mid_mfn == p2m_mid_missing_mfn) {
320 /* Separately check the mid mfn level */
321 unsigned long missing_mfn;
322 unsigned long mid_mfn_mfn;
323
324 mid_mfn = alloc_p2m_page();
325 if (!mid_mfn)
326 return false;
327
328 p2m_mid_mfn_init(mid_mfn);
329
330 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
331 mid_mfn_mfn = virt_to_mfn(mid_mfn);
332 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
333 free_p2m_page(mid_mfn);
334 else
335 p2m_top_mfn_p[topidx] = mid_mfn;
336 }
337
338 if (p2m_top[topidx][mididx] == p2m_missing) {
339 /* p2m leaf page is missing */
340 unsigned long *p2m;
341
342 p2m = alloc_p2m_page();
343 if (!p2m)
344 return false;
345
346 p2m_init(p2m);
347
348 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
349 free_p2m_page(p2m);
350 else
351 mid_mfn[mididx] = virt_to_mfn(p2m);
352 }
353
354 return true;
355}
356
357/* Try to install p2m mapping; fail if intermediate bits missing */
358bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
359{
360 unsigned topidx, mididx, idx;
361
362 if (unlikely(pfn >= MAX_P2M_PFN)) {
363 BUG_ON(mfn != INVALID_P2M_ENTRY);
364 return true;
365 }
366
367 topidx = p2m_top_index(pfn);
368 mididx = p2m_mid_index(pfn);
369 idx = p2m_index(pfn);
370
371 if (p2m_top[topidx][mididx] == p2m_missing)
372 return mfn == INVALID_P2M_ENTRY;
373
374 p2m_top[topidx][mididx][idx] = mfn;
375
376 return true;
377}
378
379bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
380{
381 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
382 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
383 return true;
384 }
385
386 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
387 if (!alloc_p2m(pfn))
388 return false;
389
390 if (!__set_phys_to_machine(pfn, mfn))
391 return false;
392 }
393
394 return true;
395}
396
397#define M2P_OVERRIDE_HASH_SHIFT 10
398#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
399
400static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
401static DEFINE_SPINLOCK(m2p_override_lock);
402
403static void __init m2p_override_init(void)
404{
405 unsigned i;
406
407 m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
408 sizeof(unsigned long));
409
410 for (i = 0; i < M2P_OVERRIDE_HASH; i++)
411 INIT_LIST_HEAD(&m2p_overrides[i]);
412}
413
414static unsigned long mfn_hash(unsigned long mfn)
415{
416 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
417}
418
419/* Add an MFN override for a particular page */
420int m2p_add_override(unsigned long mfn, struct page *page)
421{
422 unsigned long flags;
423 unsigned long pfn;
424 unsigned long address;
425 unsigned level;
426 pte_t *ptep = NULL;
427
428 pfn = page_to_pfn(page);
429 if (!PageHighMem(page)) {
430 address = (unsigned long)__va(pfn << PAGE_SHIFT);
431 ptep = lookup_address(address, &level);
432
433 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
434 "m2p_add_override: pfn %lx not mapped", pfn))
435 return -EINVAL;
436 }
437
438 page->private = mfn;
439 page->index = pfn_to_mfn(pfn);
440
441 __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
442 if (!PageHighMem(page))
443 /* Just zap old mapping for now */
444 pte_clear(&init_mm, address, ptep);
445
446 spin_lock_irqsave(&m2p_override_lock, flags);
447 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
448 spin_unlock_irqrestore(&m2p_override_lock, flags);
449
450 return 0;
451}
452
453int m2p_remove_override(struct page *page)
454{
455 unsigned long flags;
456 unsigned long mfn;
457 unsigned long pfn;
458 unsigned long address;
459 unsigned level;
460 pte_t *ptep = NULL;
461
462 pfn = page_to_pfn(page);
463 mfn = get_phys_to_machine(pfn);
464 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
465 return -EINVAL;
466
467 if (!PageHighMem(page)) {
468 address = (unsigned long)__va(pfn << PAGE_SHIFT);
469 ptep = lookup_address(address, &level);
470
471 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
472 "m2p_remove_override: pfn %lx not mapped", pfn))
473 return -EINVAL;
474 }
475
476 spin_lock_irqsave(&m2p_override_lock, flags);
477 list_del(&page->lru);
478 spin_unlock_irqrestore(&m2p_override_lock, flags);
479 __set_phys_to_machine(pfn, page->index);
480
481 if (!PageHighMem(page))
482 set_pte_at(&init_mm, address, ptep,
483 pfn_pte(pfn, PAGE_KERNEL));
484 /* No tlb flush necessary because the caller already
485 * left the pte unmapped. */
486
487 return 0;
488}
489
490struct page *m2p_find_override(unsigned long mfn)
491{
492 unsigned long flags;
493 struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
494 struct page *p, *ret;
495
496 ret = NULL;
497
498 spin_lock_irqsave(&m2p_override_lock, flags);
499
500 list_for_each_entry(p, bucket, lru) {
501 if (p->private == mfn) {
502 ret = p;
503 break;
504 }
505 }
506
507 spin_unlock_irqrestore(&m2p_override_lock, flags);
508
509 return ret;
510}
511
512unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
513{
514 struct page *p = m2p_find_override(mfn);
515 unsigned long ret = pfn;
516
517 if (p)
518 ret = page_to_pfn(p);
519
520 return ret;
521}
522EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
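The header comment of the new p2m.c explains how a pfn is split across the three tree levels. The standalone sketch below reproduces that arithmetic under the stated 64-bit assumptions (4 KiB pages, 8-byte unsigned long, hence 512 entries per level); the EX_* names and the example pfn are invented for illustration and are not part of the kernel sources.

#include <stdio.h>

#define EX_PAGE_SIZE        4096UL
#define EX_P2M_PER_PAGE     (EX_PAGE_SIZE / sizeof(unsigned long))
#define EX_P2M_MID_PER_PAGE (EX_PAGE_SIZE / sizeof(unsigned long *))
#define EX_MAX_P2M_PFN      (EX_P2M_PER_PAGE * EX_P2M_MID_PER_PAGE * EX_P2M_PER_PAGE)

int main(void)
{
	unsigned long pfn = 0x12345;	/* arbitrary example pfn */

	/* Same splits as p2m_top_index(), p2m_mid_index(), p2m_index(). */
	unsigned long topidx = pfn / (EX_P2M_MID_PER_PAGE * EX_P2M_PER_PAGE);
	unsigned long mididx = (pfn / EX_P2M_PER_PAGE) % EX_P2M_MID_PER_PAGE;
	unsigned long idx    = pfn % EX_P2M_PER_PAGE;

	/* For pfn 0x12345: topidx = 0, mididx = 145, idx = 325. */
	printf("top %lu mid %lu idx %lu (max representable pfn count %lu)\n",
	       topidx, mididx, idx, EX_MAX_P2M_PFN);
	return 0;
}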
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b5a7f928234..a8a66a50d44 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -179,8 +179,13 @@ char * __init xen_memory_setup(void)
179 e820.nr_map = 0; 179 e820.nr_map = 0;
180 xen_extra_mem_start = mem_end; 180 xen_extra_mem_start = mem_end;
181 for (i = 0; i < memmap.nr_entries; i++) { 181 for (i = 0; i < memmap.nr_entries; i++) {
182 unsigned long long end = map[i].addr + map[i].size; 182 unsigned long long end;
183 183
184 /* Guard against non-page aligned E820 entries. */
185 if (map[i].type == E820_RAM)
186 map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
187
188 end = map[i].addr + map[i].size;
184 if (map[i].type == E820_RAM && end > mem_end) { 189 if (map[i].type == E820_RAM && end > mem_end) {
185 /* RAM off the end - may be partially included */ 190 /* RAM off the end - may be partially included */
186 u64 delta = min(map[i].size, end - mem_end); 191 u64 delta = min(map[i].size, end - mem_end);
@@ -350,6 +355,7 @@ void __init xen_arch_setup(void)
350 boot_cpu_data.hlt_works_ok = 1; 355 boot_cpu_data.hlt_works_ok = 1;
351#endif 356#endif
352 pm_idle = default_idle; 357 pm_idle = default_idle;
358 boot_option_idle_override = IDLE_HALT;
353 359
354 fiddle_vdso(); 360 fiddle_vdso();
355} 361}
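The xen_memory_setup() hunk above trims each E820_RAM entry so that its end is page aligned: the size is reduced by (size + addr) % PAGE_SIZE. A minimal userspace sketch of that arithmetic, using a made-up entry rather than real firmware data, is:

#include <stdio.h>

#define EX_PAGE_SIZE 4096ULL

int main(void)
{
	unsigned long long addr = 0x100000ULL;	/* page-aligned start (1 MiB) */
	unsigned long long size = 0x7ff0800ULL;	/* end falls mid-page */

	/* Same arithmetic as the patch: size -= (size + addr) % PAGE_SIZE; */
	size -= (size + addr) % EX_PAGE_SIZE;

	/* Prints end 0x80f0000: 0x800 bytes trimmed, end now page aligned. */
	printf("end 0x%llx\n", addr + size);
	return 0;
}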