diff options
Diffstat (limited to 'arch/x86')
54 files changed, 1160 insertions, 541 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b66f2102c35d..1401d4f0ed48 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -24,6 +24,7 @@ config X86 | |||
24 | select HAVE_UNSTABLE_SCHED_CLOCK | 24 | select HAVE_UNSTABLE_SCHED_CLOCK |
25 | select HAVE_IDE | 25 | select HAVE_IDE |
26 | select HAVE_OPROFILE | 26 | select HAVE_OPROFILE |
27 | select HAVE_PERF_COUNTERS if (!M386 && !M486) | ||
27 | select HAVE_IOREMAP_PROT | 28 | select HAVE_IOREMAP_PROT |
28 | select HAVE_KPROBES | 29 | select HAVE_KPROBES |
29 | select ARCH_WANT_OPTIONAL_GPIOLIB | 30 | select ARCH_WANT_OPTIONAL_GPIOLIB |
@@ -746,7 +747,6 @@ config X86_UP_IOAPIC | |||
746 | config X86_LOCAL_APIC | 747 | config X86_LOCAL_APIC |
747 | def_bool y | 748 | def_bool y |
748 | depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC | 749 | depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC |
749 | select HAVE_PERF_COUNTERS if (!M386 && !M486) | ||
750 | 750 | ||
751 | config X86_IO_APIC | 751 | config X86_IO_APIC |
752 | def_bool y | 752 | def_bool y |
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index e2ff504b4ddc..f8ed0658404c 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile | |||
@@ -4,7 +4,7 @@ | |||
4 | # create a compressed vmlinux image from the original vmlinux | 4 | # create a compressed vmlinux image from the original vmlinux |
5 | # | 5 | # |
6 | 6 | ||
7 | targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o | 7 | targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o |
8 | 8 | ||
9 | KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 | 9 | KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 |
10 | KBUILD_CFLAGS += -fno-strict-aliasing -fPIC | 10 | KBUILD_CFLAGS += -fno-strict-aliasing -fPIC |
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index edc90f23e708..8406ed7f9926 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h | |||
@@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); | |||
33 | #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ | 33 | #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ |
34 | efi_call_virt(f, a1, a2, a3, a4, a5, a6) | 34 | efi_call_virt(f, a1, a2, a3, a4, a5, a6) |
35 | 35 | ||
36 | #define efi_ioremap(addr, size) ioremap_cache(addr, size) | 36 | #define efi_ioremap(addr, size, type) ioremap_cache(addr, size) |
37 | 37 | ||
38 | #else /* !CONFIG_X86_32 */ | 38 | #else /* !CONFIG_X86_32 */ |
39 | 39 | ||
@@ -84,7 +84,8 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, | |||
84 | efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ | 84 | efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ |
85 | (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) | 85 | (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) |
86 | 86 | ||
87 | extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); | 87 | extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, |
88 | u32 type); | ||
88 | 89 | ||
89 | #endif /* CONFIG_X86_32 */ | 90 | #endif /* CONFIG_X86_32 */ |
90 | 91 | ||
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 2bdab21f0898..c6ccbe7e81ad 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h | |||
@@ -12,9 +12,15 @@ static inline unsigned long native_save_fl(void) | |||
12 | { | 12 | { |
13 | unsigned long flags; | 13 | unsigned long flags; |
14 | 14 | ||
15 | /* | ||
16 | * Note: this needs to be "=r" not "=rm", because we have the | ||
17 | * stack offset from what gcc expects at the time the "pop" is | ||
18 | * executed, and so a memory reference with respect to the stack | ||
19 | * would end up using the wrong address. | ||
20 | */ | ||
15 | asm volatile("# __raw_save_flags\n\t" | 21 | asm volatile("# __raw_save_flags\n\t" |
16 | "pushf ; pop %0" | 22 | "pushf ; pop %0" |
17 | : "=g" (flags) | 23 | : "=r" (flags) |
18 | : /* no input */ | 24 | : /* no input */ |
19 | : "memory"); | 25 | : "memory"); |
20 | 26 | ||
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 313389cd50d2..5136dad57cbb 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h | |||
@@ -17,8 +17,7 @@ | |||
17 | /* Pages for switcher itself, then two pages per cpu */ | 17 | /* Pages for switcher itself, then two pages per cpu */ |
18 | #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) | 18 | #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) |
19 | 19 | ||
20 | /* We map at -4M (-2M when PAE is activated) for ease of mapping | 20 | /* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ |
21 | * into the guest (one PTE page). */ | ||
22 | #ifdef CONFIG_X86_PAE | 21 | #ifdef CONFIG_X86_PAE |
23 | #define SWITCHER_ADDR 0xFFE00000 | 22 | #define SWITCHER_ADDR 0xFFE00000 |
24 | #else | 23 | #else |
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 33600a66755f..ba0eed8aa1a6 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h | |||
@@ -30,27 +30,27 @@ | |||
30 | #include <asm/hw_irq.h> | 30 | #include <asm/hw_irq.h> |
31 | #include <asm/kvm_para.h> | 31 | #include <asm/kvm_para.h> |
32 | 32 | ||
33 | /*G:030 But first, how does our Guest contact the Host to ask for privileged | 33 | /*G:030 |
34 | * But first, how does our Guest contact the Host to ask for privileged | ||
34 | * operations? There are two ways: the direct way is to make a "hypercall", | 35 | * operations? There are two ways: the direct way is to make a "hypercall", |
35 | * to make requests of the Host Itself. | 36 | * to make requests of the Host Itself. |
36 | * | 37 | * |
37 | * We use the KVM hypercall mechanism. Seventeen hypercalls are | 38 | * We use the KVM hypercall mechanism, though completely different hypercall |
38 | * available: the hypercall number is put in the %eax register, and the | 39 | * numbers. Seventeen hypercalls are available: the hypercall number is put in |
39 | * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. | 40 | * the %eax register, and the arguments (when required) are placed in %ebx, |
40 | * If a return value makes sense, it's returned in %eax. | 41 | * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. |
41 | * | 42 | * |
42 | * Grossly invalid calls result in Sudden Death at the hands of the vengeful | 43 | * Grossly invalid calls result in Sudden Death at the hands of the vengeful |
43 | * Host, rather than returning failure. This reflects Winston Churchill's | 44 | * Host, rather than returning failure. This reflects Winston Churchill's |
44 | * definition of a gentleman: "someone who is only rude intentionally". */ | 45 | * definition of a gentleman: "someone who is only rude intentionally". |
45 | /*:*/ | 46 | :*/ |
46 | 47 | ||
47 | /* Can't use our min() macro here: needs to be a constant */ | 48 | /* Can't use our min() macro here: needs to be a constant */ |
48 | #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) | 49 | #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) |
49 | 50 | ||
50 | #define LHCALL_RING_SIZE 64 | 51 | #define LHCALL_RING_SIZE 64 |
51 | struct hcall_args { | 52 | struct hcall_args { |
52 | /* These map directly onto eax, ebx, ecx, edx and esi | 53 | /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ |
53 | * in struct lguest_regs */ | ||
54 | unsigned long arg0, arg1, arg2, arg3, arg4; | 54 | unsigned long arg0, arg1, arg2, arg3, arg4; |
55 | }; | 55 | }; |
56 | 56 | ||
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dd14c54ac718..0e8c2a0fd922 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h | |||
@@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte) | |||
46 | __free_page(pte); | 46 | __free_page(pte); |
47 | } | 47 | } |
48 | 48 | ||
49 | extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); | 49 | extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); |
50 | |||
51 | static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, | ||
52 | unsigned long address) | ||
53 | { | ||
54 | ___pte_free_tlb(tlb, pte); | ||
55 | } | ||
50 | 56 | ||
51 | static inline void pmd_populate_kernel(struct mm_struct *mm, | 57 | static inline void pmd_populate_kernel(struct mm_struct *mm, |
52 | pmd_t *pmd, pte_t *pte) | 58 | pmd_t *pmd, pte_t *pte) |
@@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) | |||
78 | free_page((unsigned long)pmd); | 84 | free_page((unsigned long)pmd); |
79 | } | 85 | } |
80 | 86 | ||
81 | extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); | 87 | extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); |
88 | |||
89 | static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, | ||
90 | unsigned long adddress) | ||
91 | { | ||
92 | ___pmd_free_tlb(tlb, pmd); | ||
93 | } | ||
82 | 94 | ||
83 | #ifdef CONFIG_X86_PAE | 95 | #ifdef CONFIG_X86_PAE |
84 | extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); | 96 | extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); |
@@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) | |||
108 | free_page((unsigned long)pud); | 120 | free_page((unsigned long)pud); |
109 | } | 121 | } |
110 | 122 | ||
111 | extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); | 123 | extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); |
124 | |||
125 | static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, | ||
126 | unsigned long address) | ||
127 | { | ||
128 | ___pud_free_tlb(tlb, pud); | ||
129 | } | ||
130 | |||
112 | #endif /* PAGETABLE_LEVELS > 3 */ | 131 | #endif /* PAGETABLE_LEVELS > 3 */ |
113 | #endif /* PAGETABLE_LEVELS > 2 */ | 132 | #endif /* PAGETABLE_LEVELS > 2 */ |
114 | 133 | ||
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3cc06e3fceb8..16748077559a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _ASM_X86_PGTABLE_H | 2 | #define _ASM_X86_PGTABLE_H |
3 | 3 | ||
4 | #include <asm/page.h> | 4 | #include <asm/page.h> |
5 | #include <asm/e820.h> | ||
5 | 6 | ||
6 | #include <asm/pgtable_types.h> | 7 | #include <asm/pgtable_types.h> |
7 | 8 | ||
@@ -269,10 +270,17 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
269 | 270 | ||
270 | #define canon_pgprot(p) __pgprot(massage_pgprot(p)) | 271 | #define canon_pgprot(p) __pgprot(massage_pgprot(p)) |
271 | 272 | ||
272 | static inline int is_new_memtype_allowed(unsigned long flags, | 273 | static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, |
273 | unsigned long new_flags) | 274 | unsigned long flags, |
275 | unsigned long new_flags) | ||
274 | { | 276 | { |
275 | /* | 277 | /* |
278 | * PAT type is always WB for ISA. So no need to check. | ||
279 | */ | ||
280 | if (is_ISA_range(paddr, paddr + size - 1)) | ||
281 | return 1; | ||
282 | |||
283 | /* | ||
276 | * Certain new memtypes are not allowed with certain | 284 | * Certain new memtypes are not allowed with certain |
277 | * requested memtype: | 285 | * requested memtype: |
278 | * - request is uncached, return cannot be write-back | 286 | * - request is uncached, return cannot be write-back |
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 20e6a795e160..d2c6c930b491 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h | |||
@@ -212,9 +212,9 @@ extern int __get_user_bad(void); | |||
212 | : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") | 212 | : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") |
213 | #else | 213 | #else |
214 | #define __put_user_asm_u64(x, ptr, retval, errret) \ | 214 | #define __put_user_asm_u64(x, ptr, retval, errret) \ |
215 | __put_user_asm(x, ptr, retval, "q", "", "Zr", errret) | 215 | __put_user_asm(x, ptr, retval, "q", "", "er", errret) |
216 | #define __put_user_asm_ex_u64(x, addr) \ | 216 | #define __put_user_asm_ex_u64(x, addr) \ |
217 | __put_user_asm_ex(x, addr, "q", "", "Zr") | 217 | __put_user_asm_ex(x, addr, "q", "", "er") |
218 | #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) | 218 | #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) |
219 | #endif | 219 | #endif |
220 | 220 | ||
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8cc687326eb8..db24b215fc50 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h | |||
@@ -88,11 +88,11 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) | |||
88 | ret, "l", "k", "ir", 4); | 88 | ret, "l", "k", "ir", 4); |
89 | return ret; | 89 | return ret; |
90 | case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, | 90 | case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, |
91 | ret, "q", "", "ir", 8); | 91 | ret, "q", "", "er", 8); |
92 | return ret; | 92 | return ret; |
93 | case 10: | 93 | case 10: |
94 | __put_user_asm(*(u64 *)src, (u64 __user *)dst, | 94 | __put_user_asm(*(u64 *)src, (u64 __user *)dst, |
95 | ret, "q", "", "ir", 10); | 95 | ret, "q", "", "er", 10); |
96 | if (unlikely(ret)) | 96 | if (unlikely(ret)) |
97 | return ret; | 97 | return ret; |
98 | asm("":::"memory"); | 98 | asm("":::"memory"); |
@@ -101,12 +101,12 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) | |||
101 | return ret; | 101 | return ret; |
102 | case 16: | 102 | case 16: |
103 | __put_user_asm(*(u64 *)src, (u64 __user *)dst, | 103 | __put_user_asm(*(u64 *)src, (u64 __user *)dst, |
104 | ret, "q", "", "ir", 16); | 104 | ret, "q", "", "er", 16); |
105 | if (unlikely(ret)) | 105 | if (unlikely(ret)) |
106 | return ret; | 106 | return ret; |
107 | asm("":::"memory"); | 107 | asm("":::"memory"); |
108 | __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, | 108 | __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, |
109 | ret, "q", "", "ir", 8); | 109 | ret, "q", "", "er", 8); |
110 | return ret; | 110 | return ret; |
111 | default: | 111 | default: |
112 | return copy_user_generic((__force void *)dst, src, size); | 112 | return copy_user_generic((__force void *)dst, src, size); |
@@ -157,7 +157,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) | |||
157 | ret, "q", "", "=r", 8); | 157 | ret, "q", "", "=r", 8); |
158 | if (likely(!ret)) | 158 | if (likely(!ret)) |
159 | __put_user_asm(tmp, (u64 __user *)dst, | 159 | __put_user_asm(tmp, (u64 __user *)dst, |
160 | ret, "q", "", "ir", 8); | 160 | ret, "q", "", "er", 8); |
161 | return ret; | 161 | return ret; |
162 | } | 162 | } |
163 | default: | 163 | default: |
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index bddd44f2f0ab..80e2984f521c 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h | |||
@@ -133,7 +133,7 @@ struct bau_msg_payload { | |||
133 | * see table 4.2.3.0.1 in broacast_assist spec. | 133 | * see table 4.2.3.0.1 in broacast_assist spec. |
134 | */ | 134 | */ |
135 | struct bau_msg_header { | 135 | struct bau_msg_header { |
136 | unsigned int dest_subnodeid:6; /* must be zero */ | 136 | unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ |
137 | /* bits 5:0 */ | 137 | /* bits 5:0 */ |
138 | unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ | 138 | unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ |
139 | /* bits 20:6 */ /* first bit in node_map */ | 139 | /* bits 20:6 */ /* first bit in node_map */ |
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 341070f7ad5c..77a68505419a 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h | |||
@@ -175,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); | |||
175 | #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) | 175 | #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) |
176 | 176 | ||
177 | #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ | 177 | #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ |
178 | ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) | 178 | (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) |
179 | 179 | ||
180 | #define UV_APIC_PNODE_SHIFT 6 | 180 | #define UV_APIC_PNODE_SHIFT 6 |
181 | 181 | ||
@@ -327,6 +327,7 @@ struct uv_blade_info { | |||
327 | unsigned short nr_possible_cpus; | 327 | unsigned short nr_possible_cpus; |
328 | unsigned short nr_online_cpus; | 328 | unsigned short nr_online_cpus; |
329 | unsigned short pnode; | 329 | unsigned short pnode; |
330 | short memory_nid; | ||
330 | }; | 331 | }; |
331 | extern struct uv_blade_info *uv_blade_info; | 332 | extern struct uv_blade_info *uv_blade_info; |
332 | extern short *uv_node_to_blade; | 333 | extern short *uv_node_to_blade; |
@@ -363,6 +364,12 @@ static inline int uv_blade_to_pnode(int bid) | |||
363 | return uv_blade_info[bid].pnode; | 364 | return uv_blade_info[bid].pnode; |
364 | } | 365 | } |
365 | 366 | ||
367 | /* Nid of memory node on blade. -1 if no blade-local memory */ | ||
368 | static inline int uv_blade_to_memory_nid(int bid) | ||
369 | { | ||
370 | return uv_blade_info[bid].memory_nid; | ||
371 | } | ||
372 | |||
366 | /* Determine the number of possible cpus on a blade */ | 373 | /* Determine the number of possible cpus on a blade */ |
367 | static inline int uv_blade_nr_possible_cpus(int bid) | 374 | static inline int uv_blade_nr_possible_cpus(int bid) |
368 | { | 375 | { |
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2284a4812b68..d2ed6c5ddc80 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c | |||
@@ -3793,6 +3793,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, | |||
3793 | mmr_pnode = uv_blade_to_pnode(mmr_blade); | 3793 | mmr_pnode = uv_blade_to_pnode(mmr_blade); |
3794 | uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); | 3794 | uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); |
3795 | 3795 | ||
3796 | if (cfg->move_in_progress) | ||
3797 | send_cleanup_vector(cfg); | ||
3798 | |||
3796 | return irq; | 3799 | return irq; |
3797 | } | 3800 | } |
3798 | 3801 | ||
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index dbf5445727a9..6ef00ba4c886 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c | |||
@@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) | |||
106 | unsigned long mask = cpumask_bits(cpumask)[0]; | 106 | unsigned long mask = cpumask_bits(cpumask)[0]; |
107 | unsigned long flags; | 107 | unsigned long flags; |
108 | 108 | ||
109 | if (WARN_ONCE(!mask, "empty IPI mask")) | ||
110 | return; | ||
111 | |||
109 | local_irq_save(flags); | 112 | local_irq_save(flags); |
110 | WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); | 113 | WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); |
111 | __default_send_IPI_dest_field(mask, vector, apic->dest_logical); | 114 | __default_send_IPI_dest_field(mask, vector, apic->dest_logical); |
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8e4cbb255c38..a5371ec36776 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c | |||
@@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | |||
17 | return x2apic_enabled(); | 17 | return x2apic_enabled(); |
18 | } | 18 | } |
19 | 19 | ||
20 | /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ | 20 | /* |
21 | 21 | * need to use more than cpu 0, because we need more vectors when | |
22 | * MSI-X are used. | ||
23 | */ | ||
22 | static const struct cpumask *x2apic_target_cpus(void) | 24 | static const struct cpumask *x2apic_target_cpus(void) |
23 | { | 25 | { |
24 | return cpumask_of(0); | 26 | return cpu_online_mask; |
25 | } | 27 | } |
26 | 28 | ||
27 | /* | 29 | /* |
@@ -170,7 +172,7 @@ static unsigned long set_apic_id(unsigned int id) | |||
170 | 172 | ||
171 | static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) | 173 | static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) |
172 | { | 174 | { |
173 | return current_cpu_data.initial_apicid >> index_msb; | 175 | return initial_apicid >> index_msb; |
174 | } | 176 | } |
175 | 177 | ||
176 | static void x2apic_send_IPI_self(int vector) | 178 | static void x2apic_send_IPI_self(int vector) |
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a284359627e7..a8989aadc99a 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c | |||
@@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | |||
27 | return 0; | 27 | return 0; |
28 | } | 28 | } |
29 | 29 | ||
30 | /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ | 30 | /* |
31 | 31 | * need to use more than cpu 0, because we need more vectors when | |
32 | * MSI-X are used. | ||
33 | */ | ||
32 | static const struct cpumask *x2apic_target_cpus(void) | 34 | static const struct cpumask *x2apic_target_cpus(void) |
33 | { | 35 | { |
34 | return cpumask_of(0); | 36 | return cpu_online_mask; |
35 | } | 37 | } |
36 | 38 | ||
37 | static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) | 39 | static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) |
@@ -162,7 +164,7 @@ static unsigned long set_apic_id(unsigned int id) | |||
162 | 164 | ||
163 | static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) | 165 | static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) |
164 | { | 166 | { |
165 | return current_cpu_data.initial_apicid >> index_msb; | 167 | return initial_apicid >> index_msb; |
166 | } | 168 | } |
167 | 169 | ||
168 | static void x2apic_send_IPI_self(int vector) | 170 | static void x2apic_send_IPI_self(int vector) |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 096d19aea2f7..601159374e87 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c | |||
@@ -46,7 +46,7 @@ static int early_get_nodeid(void) | |||
46 | return node_id.s.node_id; | 46 | return node_id.s.node_id; |
47 | } | 47 | } |
48 | 48 | ||
49 | static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 49 | static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
50 | { | 50 | { |
51 | if (!strcmp(oem_id, "SGI")) { | 51 | if (!strcmp(oem_id, "SGI")) { |
52 | if (!strcmp(oem_table_id, "UVL")) | 52 | if (!strcmp(oem_table_id, "UVL")) |
@@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector) | |||
253 | apic_write(APIC_SELF_IPI, vector); | 253 | apic_write(APIC_SELF_IPI, vector); |
254 | } | 254 | } |
255 | 255 | ||
256 | struct apic apic_x2apic_uv_x = { | 256 | struct apic __refdata apic_x2apic_uv_x = { |
257 | 257 | ||
258 | .name = "UV large system", | 258 | .name = "UV large system", |
259 | .probe = NULL, | 259 | .probe = NULL, |
@@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = { | |||
261 | .apic_id_registered = uv_apic_id_registered, | 261 | .apic_id_registered = uv_apic_id_registered, |
262 | 262 | ||
263 | .irq_delivery_mode = dest_Fixed, | 263 | .irq_delivery_mode = dest_Fixed, |
264 | .irq_dest_mode = 1, /* logical */ | 264 | .irq_dest_mode = 0, /* physical */ |
265 | 265 | ||
266 | .target_cpus = uv_target_cpus, | 266 | .target_cpus = uv_target_cpus, |
267 | .disable_esr = 0, | 267 | .disable_esr = 0, |
@@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) | |||
362 | BUG(); | 362 | BUG(); |
363 | } | 363 | } |
364 | 364 | ||
365 | static __init void map_low_mmrs(void) | ||
366 | { | ||
367 | init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); | ||
368 | init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); | ||
369 | } | ||
370 | |||
371 | enum map_type {map_wb, map_uc}; | 365 | enum map_type {map_wb, map_uc}; |
372 | 366 | ||
373 | static __init void map_high(char *id, unsigned long base, int shift, | 367 | static __init void map_high(char *id, unsigned long base, int shift, |
@@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode) | |||
395 | map_high("GRU", gru.s.base, shift, max_pnode, map_wb); | 389 | map_high("GRU", gru.s.base, shift, max_pnode, map_wb); |
396 | } | 390 | } |
397 | 391 | ||
398 | static __init void map_config_high(int max_pnode) | ||
399 | { | ||
400 | union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; | ||
401 | int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; | ||
402 | |||
403 | cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); | ||
404 | if (cfg.s.enable) | ||
405 | map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); | ||
406 | } | ||
407 | |||
408 | static __init void map_mmr_high(int max_pnode) | ||
409 | { | ||
410 | union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; | ||
411 | int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; | ||
412 | |||
413 | mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); | ||
414 | if (mmr.s.enable) | ||
415 | map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); | ||
416 | } | ||
417 | |||
418 | static __init void map_mmioh_high(int max_pnode) | 392 | static __init void map_mmioh_high(int max_pnode) |
419 | { | 393 | { |
420 | union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; | 394 | union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; |
@@ -566,8 +540,6 @@ void __init uv_system_init(void) | |||
566 | unsigned long mmr_base, present, paddr; | 540 | unsigned long mmr_base, present, paddr; |
567 | unsigned short pnode_mask; | 541 | unsigned short pnode_mask; |
568 | 542 | ||
569 | map_low_mmrs(); | ||
570 | |||
571 | m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); | 543 | m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); |
572 | m_val = m_n_config.s.m_skt; | 544 | m_val = m_n_config.s.m_skt; |
573 | n_val = m_n_config.s.n_skt; | 545 | n_val = m_n_config.s.n_skt; |
@@ -591,6 +563,8 @@ void __init uv_system_init(void) | |||
591 | bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); | 563 | bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); |
592 | uv_blade_info = kmalloc(bytes, GFP_KERNEL); | 564 | uv_blade_info = kmalloc(bytes, GFP_KERNEL); |
593 | BUG_ON(!uv_blade_info); | 565 | BUG_ON(!uv_blade_info); |
566 | for (blade = 0; blade < uv_num_possible_blades(); blade++) | ||
567 | uv_blade_info[blade].memory_nid = -1; | ||
594 | 568 | ||
595 | get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); | 569 | get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); |
596 | 570 | ||
@@ -629,6 +603,9 @@ void __init uv_system_init(void) | |||
629 | lcpu = uv_blade_info[blade].nr_possible_cpus; | 603 | lcpu = uv_blade_info[blade].nr_possible_cpus; |
630 | uv_blade_info[blade].nr_possible_cpus++; | 604 | uv_blade_info[blade].nr_possible_cpus++; |
631 | 605 | ||
606 | /* Any node on the blade, else will contain -1. */ | ||
607 | uv_blade_info[blade].memory_nid = nid; | ||
608 | |||
632 | uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; | 609 | uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; |
633 | uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; | 610 | uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; |
634 | uv_cpu_hub_info(cpu)->m_val = m_val; | 611 | uv_cpu_hub_info(cpu)->m_val = m_val; |
@@ -662,11 +639,10 @@ void __init uv_system_init(void) | |||
662 | pnode = (paddr >> m_val) & pnode_mask; | 639 | pnode = (paddr >> m_val) & pnode_mask; |
663 | blade = boot_pnode_to_blade(pnode); | 640 | blade = boot_pnode_to_blade(pnode); |
664 | uv_node_to_blade[nid] = blade; | 641 | uv_node_to_blade[nid] = blade; |
642 | max_pnode = max(pnode, max_pnode); | ||
665 | } | 643 | } |
666 | 644 | ||
667 | map_gru_high(max_pnode); | 645 | map_gru_high(max_pnode); |
668 | map_mmr_high(max_pnode); | ||
669 | map_config_high(max_pnode); | ||
670 | map_mmioh_high(max_pnode); | 646 | map_mmioh_high(max_pnode); |
671 | 647 | ||
672 | uv_cpu_init(); | 648 | uv_cpu_init(); |
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 79302e9a33a4..442b5508893f 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -811,7 +811,7 @@ static int apm_do_idle(void) | |||
811 | u8 ret = 0; | 811 | u8 ret = 0; |
812 | int idled = 0; | 812 | int idled = 0; |
813 | int polling; | 813 | int polling; |
814 | int err; | 814 | int err = 0; |
815 | 815 | ||
816 | polling = !!(current_thread_info()->status & TS_POLLING); | 816 | polling = !!(current_thread_info()->status & TS_POLLING); |
817 | if (polling) { | 817 | if (polling) { |
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3efcb2b96a15..c1f253dac155 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
@@ -7,6 +7,10 @@ ifdef CONFIG_FUNCTION_TRACER | |||
7 | CFLAGS_REMOVE_common.o = -pg | 7 | CFLAGS_REMOVE_common.o = -pg |
8 | endif | 8 | endif |
9 | 9 | ||
10 | # Make sure load_percpu_segment has no stackprotector | ||
11 | nostackp := $(call cc-option, -fno-stack-protector) | ||
12 | CFLAGS_common.o := $(nostackp) | ||
13 | |||
10 | obj-y := intel_cacheinfo.o addon_cpuid_features.o | 14 | obj-y := intel_cacheinfo.o addon_cpuid_features.o |
11 | obj-y += proc.o capflags.o powerflags.o common.o | 15 | obj-y += proc.o capflags.o powerflags.o common.o |
12 | obj-y += vmware.o hypervisor.o | 16 | obj-y += vmware.o hypervisor.o |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 28e5f5956042..63fddcd082cd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -356,7 +356,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | |||
356 | #endif | 356 | #endif |
357 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) | 357 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) |
358 | /* check CPU config space for extended APIC ID */ | 358 | /* check CPU config space for extended APIC ID */ |
359 | if (c->x86 >= 0xf) { | 359 | if (cpu_has_apic && c->x86 >= 0xf) { |
360 | unsigned int val; | 360 | unsigned int val; |
361 | val = read_pci_config(0, 24, 0, 0x68); | 361 | val = read_pci_config(0, 24, 0, 0x68); |
362 | if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) | 362 | if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) |
@@ -400,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
400 | level = cpuid_eax(1); | 400 | level = cpuid_eax(1); |
401 | if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) | 401 | if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) |
402 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | 402 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); |
403 | |||
404 | /* | ||
405 | * Some BIOSes incorrectly force this feature, but only K8 | ||
406 | * revision D (model = 0x14) and later actually support it. | ||
407 | */ | ||
408 | if (c->x86_model < 0x14) | ||
409 | clear_cpu_cap(c, X86_FEATURE_LAHF_LM); | ||
403 | } | 410 | } |
404 | if (c->x86 == 0x10 || c->x86 == 0x11) | 411 | if (c->x86 == 0x10 || c->x86 == 0x11) |
405 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | 412 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9a..5ce60a88027b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void) | |||
59 | alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); | 59 | alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); |
60 | } | 60 | } |
61 | 61 | ||
62 | static const struct cpu_dev *this_cpu __cpuinitdata; | 62 | static void __cpuinit default_init(struct cpuinfo_x86 *c) |
63 | { | ||
64 | #ifdef CONFIG_X86_64 | ||
65 | display_cacheinfo(c); | ||
66 | #else | ||
67 | /* Not much we can do here... */ | ||
68 | /* Check if at least it has cpuid */ | ||
69 | if (c->cpuid_level == -1) { | ||
70 | /* No cpuid. It must be an ancient CPU */ | ||
71 | if (c->x86 == 4) | ||
72 | strcpy(c->x86_model_id, "486"); | ||
73 | else if (c->x86 == 3) | ||
74 | strcpy(c->x86_model_id, "386"); | ||
75 | } | ||
76 | #endif | ||
77 | } | ||
78 | |||
79 | static const struct cpu_dev __cpuinitconst default_cpu = { | ||
80 | .c_init = default_init, | ||
81 | .c_vendor = "Unknown", | ||
82 | .c_x86_vendor = X86_VENDOR_UNKNOWN, | ||
83 | }; | ||
84 | |||
85 | static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; | ||
63 | 86 | ||
64 | DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { | 87 | DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { |
65 | #ifdef CONFIG_X86_64 | 88 | #ifdef CONFIG_X86_64 |
@@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu) | |||
332 | 355 | ||
333 | static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; | 356 | static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; |
334 | 357 | ||
335 | static void __cpuinit default_init(struct cpuinfo_x86 *c) | ||
336 | { | ||
337 | #ifdef CONFIG_X86_64 | ||
338 | display_cacheinfo(c); | ||
339 | #else | ||
340 | /* Not much we can do here... */ | ||
341 | /* Check if at least it has cpuid */ | ||
342 | if (c->cpuid_level == -1) { | ||
343 | /* No cpuid. It must be an ancient CPU */ | ||
344 | if (c->x86 == 4) | ||
345 | strcpy(c->x86_model_id, "486"); | ||
346 | else if (c->x86 == 3) | ||
347 | strcpy(c->x86_model_id, "386"); | ||
348 | } | ||
349 | #endif | ||
350 | } | ||
351 | |||
352 | static const struct cpu_dev __cpuinitconst default_cpu = { | ||
353 | .c_init = default_init, | ||
354 | .c_vendor = "Unknown", | ||
355 | .c_x86_vendor = X86_VENDOR_UNKNOWN, | ||
356 | }; | ||
357 | |||
358 | static void __cpuinit get_model_name(struct cpuinfo_x86 *c) | 358 | static void __cpuinit get_model_name(struct cpuinfo_x86 *c) |
359 | { | 359 | { |
360 | unsigned int *v; | 360 | unsigned int *v; |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 484c1e5f658e..01213048f62f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -1226,8 +1226,13 @@ static void mce_init(void) | |||
1226 | } | 1226 | } |
1227 | 1227 | ||
1228 | /* Add per CPU specific workarounds here */ | 1228 | /* Add per CPU specific workarounds here */ |
1229 | static void mce_cpu_quirks(struct cpuinfo_x86 *c) | 1229 | static int mce_cpu_quirks(struct cpuinfo_x86 *c) |
1230 | { | 1230 | { |
1231 | if (c->x86_vendor == X86_VENDOR_UNKNOWN) { | ||
1232 | pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); | ||
1233 | return -EOPNOTSUPP; | ||
1234 | } | ||
1235 | |||
1231 | /* This should be disabled by the BIOS, but isn't always */ | 1236 | /* This should be disabled by the BIOS, but isn't always */ |
1232 | if (c->x86_vendor == X86_VENDOR_AMD) { | 1237 | if (c->x86_vendor == X86_VENDOR_AMD) { |
1233 | if (c->x86 == 15 && banks > 4) { | 1238 | if (c->x86 == 15 && banks > 4) { |
@@ -1273,11 +1278,20 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) | |||
1273 | if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && | 1278 | if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && |
1274 | monarch_timeout < 0) | 1279 | monarch_timeout < 0) |
1275 | monarch_timeout = USEC_PER_SEC; | 1280 | monarch_timeout = USEC_PER_SEC; |
1281 | |||
1282 | /* | ||
1283 | * There are also broken BIOSes on some Pentium M and | ||
1284 | * earlier systems: | ||
1285 | */ | ||
1286 | if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) | ||
1287 | mce_bootlog = 0; | ||
1276 | } | 1288 | } |
1277 | if (monarch_timeout < 0) | 1289 | if (monarch_timeout < 0) |
1278 | monarch_timeout = 0; | 1290 | monarch_timeout = 0; |
1279 | if (mce_bootlog != 0) | 1291 | if (mce_bootlog != 0) |
1280 | mce_panic_timeout = 30; | 1292 | mce_panic_timeout = 30; |
1293 | |||
1294 | return 0; | ||
1281 | } | 1295 | } |
1282 | 1296 | ||
1283 | static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) | 1297 | static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) |
@@ -1338,11 +1352,10 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | |||
1338 | if (!mce_available(c)) | 1352 | if (!mce_available(c)) |
1339 | return; | 1353 | return; |
1340 | 1354 | ||
1341 | if (mce_cap_init() < 0) { | 1355 | if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { |
1342 | mce_disabled = 1; | 1356 | mce_disabled = 1; |
1343 | return; | 1357 | return; |
1344 | } | 1358 | } |
1345 | mce_cpu_quirks(c); | ||
1346 | 1359 | ||
1347 | machine_check_vector = do_machine_check; | 1360 | machine_check_vector = do_machine_check; |
1348 | 1361 | ||
@@ -1692,17 +1705,15 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, | |||
1692 | const char *buf, size_t siz) | 1705 | const char *buf, size_t siz) |
1693 | { | 1706 | { |
1694 | char *p; | 1707 | char *p; |
1695 | int len; | ||
1696 | 1708 | ||
1697 | strncpy(mce_helper, buf, sizeof(mce_helper)); | 1709 | strncpy(mce_helper, buf, sizeof(mce_helper)); |
1698 | mce_helper[sizeof(mce_helper)-1] = 0; | 1710 | mce_helper[sizeof(mce_helper)-1] = 0; |
1699 | len = strlen(mce_helper); | ||
1700 | p = strchr(mce_helper, '\n'); | 1711 | p = strchr(mce_helper, '\n'); |
1701 | 1712 | ||
1702 | if (*p) | 1713 | if (p) |
1703 | *p = 0; | 1714 | *p = 0; |
1704 | 1715 | ||
1705 | return len; | 1716 | return strlen(mce_helper) + !!p; |
1706 | } | 1717 | } |
1707 | 1718 | ||
1708 | static ssize_t set_ignore_ce(struct sys_device *s, | 1719 | static ssize_t set_ignore_ce(struct sys_device *s, |
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd5..5957a93e5173 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c | |||
@@ -36,6 +36,7 @@ | |||
36 | 36 | ||
37 | static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; | 37 | static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; |
38 | static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); | 38 | static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); |
39 | static DEFINE_PER_CPU(bool, thermal_throttle_active); | ||
39 | 40 | ||
40 | static atomic_t therm_throt_en = ATOMIC_INIT(0); | 41 | static atomic_t therm_throt_en = ATOMIC_INIT(0); |
41 | 42 | ||
@@ -96,27 +97,33 @@ static int therm_throt_process(int curr) | |||
96 | { | 97 | { |
97 | unsigned int cpu = smp_processor_id(); | 98 | unsigned int cpu = smp_processor_id(); |
98 | __u64 tmp_jiffs = get_jiffies_64(); | 99 | __u64 tmp_jiffs = get_jiffies_64(); |
100 | bool was_throttled = __get_cpu_var(thermal_throttle_active); | ||
101 | bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; | ||
99 | 102 | ||
100 | if (curr) | 103 | if (is_throttled) |
101 | __get_cpu_var(thermal_throttle_count)++; | 104 | __get_cpu_var(thermal_throttle_count)++; |
102 | 105 | ||
103 | if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) | 106 | if (!(was_throttled ^ is_throttled) && |
107 | time_before64(tmp_jiffs, __get_cpu_var(next_check))) | ||
104 | return 0; | 108 | return 0; |
105 | 109 | ||
106 | __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; | 110 | __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; |
107 | 111 | ||
108 | /* if we just entered the thermal event */ | 112 | /* if we just entered the thermal event */ |
109 | if (curr) { | 113 | if (is_throttled) { |
110 | printk(KERN_CRIT "CPU%d: Temperature above threshold, " | 114 | printk(KERN_CRIT "CPU%d: Temperature above threshold, " |
111 | "cpu clock throttled (total events = %lu)\n", cpu, | 115 | "cpu clock throttled (total events = %lu)\n", |
112 | __get_cpu_var(thermal_throttle_count)); | 116 | cpu, __get_cpu_var(thermal_throttle_count)); |
113 | 117 | ||
114 | add_taint(TAINT_MACHINE_CHECK); | 118 | add_taint(TAINT_MACHINE_CHECK); |
115 | } else { | 119 | return 1; |
116 | printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); | 120 | } |
121 | if (was_throttled) { | ||
122 | printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); | ||
123 | return 1; | ||
117 | } | 124 | } |
118 | 125 | ||
119 | return 1; | 126 | return 0; |
120 | } | 127 | } |
121 | 128 | ||
122 | #ifdef CONFIG_SYSFS | 129 | #ifdef CONFIG_SYSFS |
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 36c3dc7b8991..900332b800f8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c | |||
@@ -55,6 +55,7 @@ struct x86_pmu { | |||
55 | int num_counters_fixed; | 55 | int num_counters_fixed; |
56 | int counter_bits; | 56 | int counter_bits; |
57 | u64 counter_mask; | 57 | u64 counter_mask; |
58 | int apic; | ||
58 | u64 max_period; | 59 | u64 max_period; |
59 | u64 intel_ctrl; | 60 | u64 intel_ctrl; |
60 | }; | 61 | }; |
@@ -66,6 +67,52 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { | |||
66 | }; | 67 | }; |
67 | 68 | ||
68 | /* | 69 | /* |
70 | * Not sure about some of these | ||
71 | */ | ||
72 | static const u64 p6_perfmon_event_map[] = | ||
73 | { | ||
74 | [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, | ||
75 | [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, | ||
76 | [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, | ||
77 | [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, | ||
78 | [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, | ||
79 | [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, | ||
80 | [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, | ||
81 | }; | ||
82 | |||
83 | static u64 p6_pmu_event_map(int event) | ||
84 | { | ||
85 | return p6_perfmon_event_map[event]; | ||
86 | } | ||
87 | |||
88 | /* | ||
89 | * Counter setting that is specified not to count anything. | ||
90 | * We use this to effectively disable a counter. | ||
91 | * | ||
92 | * L2_RQSTS with 0 MESI unit mask. | ||
93 | */ | ||
94 | #define P6_NOP_COUNTER 0x0000002EULL | ||
95 | |||
96 | static u64 p6_pmu_raw_event(u64 event) | ||
97 | { | ||
98 | #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL | ||
99 | #define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL | ||
100 | #define P6_EVNTSEL_EDGE_MASK 0x00040000ULL | ||
101 | #define P6_EVNTSEL_INV_MASK 0x00800000ULL | ||
102 | #define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL | ||
103 | |||
104 | #define P6_EVNTSEL_MASK \ | ||
105 | (P6_EVNTSEL_EVENT_MASK | \ | ||
106 | P6_EVNTSEL_UNIT_MASK | \ | ||
107 | P6_EVNTSEL_EDGE_MASK | \ | ||
108 | P6_EVNTSEL_INV_MASK | \ | ||
109 | P6_EVNTSEL_COUNTER_MASK) | ||
110 | |||
111 | return event & P6_EVNTSEL_MASK; | ||
112 | } | ||
113 | |||
114 | |||
115 | /* | ||
69 | * Intel PerfMon v3. Used on Core2 and later. | 116 | * Intel PerfMon v3. Used on Core2 and later. |
70 | */ | 117 | */ |
71 | static const u64 intel_perfmon_event_map[] = | 118 | static const u64 intel_perfmon_event_map[] = |
@@ -567,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex); | |||
567 | 614 | ||
568 | static bool reserve_pmc_hardware(void) | 615 | static bool reserve_pmc_hardware(void) |
569 | { | 616 | { |
617 | #ifdef CONFIG_X86_LOCAL_APIC | ||
570 | int i; | 618 | int i; |
571 | 619 | ||
572 | if (nmi_watchdog == NMI_LOCAL_APIC) | 620 | if (nmi_watchdog == NMI_LOCAL_APIC) |
@@ -581,9 +629,11 @@ static bool reserve_pmc_hardware(void) | |||
581 | if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) | 629 | if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) |
582 | goto eventsel_fail; | 630 | goto eventsel_fail; |
583 | } | 631 | } |
632 | #endif | ||
584 | 633 | ||
585 | return true; | 634 | return true; |
586 | 635 | ||
636 | #ifdef CONFIG_X86_LOCAL_APIC | ||
587 | eventsel_fail: | 637 | eventsel_fail: |
588 | for (i--; i >= 0; i--) | 638 | for (i--; i >= 0; i--) |
589 | release_evntsel_nmi(x86_pmu.eventsel + i); | 639 | release_evntsel_nmi(x86_pmu.eventsel + i); |
@@ -598,10 +648,12 @@ perfctr_fail: | |||
598 | enable_lapic_nmi_watchdog(); | 648 | enable_lapic_nmi_watchdog(); |
599 | 649 | ||
600 | return false; | 650 | return false; |
651 | #endif | ||
601 | } | 652 | } |
602 | 653 | ||
603 | static void release_pmc_hardware(void) | 654 | static void release_pmc_hardware(void) |
604 | { | 655 | { |
656 | #ifdef CONFIG_X86_LOCAL_APIC | ||
605 | int i; | 657 | int i; |
606 | 658 | ||
607 | for (i = 0; i < x86_pmu.num_counters; i++) { | 659 | for (i = 0; i < x86_pmu.num_counters; i++) { |
@@ -611,6 +663,7 @@ static void release_pmc_hardware(void) | |||
611 | 663 | ||
612 | if (nmi_watchdog == NMI_LOCAL_APIC) | 664 | if (nmi_watchdog == NMI_LOCAL_APIC) |
613 | enable_lapic_nmi_watchdog(); | 665 | enable_lapic_nmi_watchdog(); |
666 | #endif | ||
614 | } | 667 | } |
615 | 668 | ||
616 | static void hw_perf_counter_destroy(struct perf_counter *counter) | 669 | static void hw_perf_counter_destroy(struct perf_counter *counter) |
@@ -666,6 +719,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) | |||
666 | { | 719 | { |
667 | struct perf_counter_attr *attr = &counter->attr; | 720 | struct perf_counter_attr *attr = &counter->attr; |
668 | struct hw_perf_counter *hwc = &counter->hw; | 721 | struct hw_perf_counter *hwc = &counter->hw; |
722 | u64 config; | ||
669 | int err; | 723 | int err; |
670 | 724 | ||
671 | if (!x86_pmu_initialized()) | 725 | if (!x86_pmu_initialized()) |
@@ -701,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) | |||
701 | hwc->sample_period = x86_pmu.max_period; | 755 | hwc->sample_period = x86_pmu.max_period; |
702 | hwc->last_period = hwc->sample_period; | 756 | hwc->last_period = hwc->sample_period; |
703 | atomic64_set(&hwc->period_left, hwc->sample_period); | 757 | atomic64_set(&hwc->period_left, hwc->sample_period); |
758 | } else { | ||
759 | /* | ||
760 | * If we have a PMU initialized but no APIC | ||
761 | * interrupts, we cannot sample hardware | ||
762 | * counters (user-space has to fall back and | ||
763 | * sample via a hrtimer based software counter): | ||
764 | */ | ||
765 | if (!x86_pmu.apic) | ||
766 | return -EOPNOTSUPP; | ||
704 | } | 767 | } |
705 | 768 | ||
706 | counter->destroy = hw_perf_counter_destroy; | 769 | counter->destroy = hw_perf_counter_destroy; |
@@ -718,14 +781,40 @@ static int __hw_perf_counter_init(struct perf_counter *counter) | |||
718 | 781 | ||
719 | if (attr->config >= x86_pmu.max_events) | 782 | if (attr->config >= x86_pmu.max_events) |
720 | return -EINVAL; | 783 | return -EINVAL; |
784 | |||
721 | /* | 785 | /* |
722 | * The generic map: | 786 | * The generic map: |
723 | */ | 787 | */ |
724 | hwc->config |= x86_pmu.event_map(attr->config); | 788 | config = x86_pmu.event_map(attr->config); |
789 | |||
790 | if (config == 0) | ||
791 | return -ENOENT; | ||
792 | |||
793 | if (config == -1LL) | ||
794 | return -EINVAL; | ||
795 | |||
796 | hwc->config |= config; | ||
725 | 797 | ||
726 | return 0; | 798 | return 0; |
727 | } | 799 | } |
728 | 800 | ||
801 | static void p6_pmu_disable_all(void) | ||
802 | { | ||
803 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
804 | u64 val; | ||
805 | |||
806 | if (!cpuc->enabled) | ||
807 | return; | ||
808 | |||
809 | cpuc->enabled = 0; | ||
810 | barrier(); | ||
811 | |||
812 | /* p6 only has one enable register */ | ||
813 | rdmsrl(MSR_P6_EVNTSEL0, val); | ||
814 | val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
815 | wrmsrl(MSR_P6_EVNTSEL0, val); | ||
816 | } | ||
817 | |||
729 | static void intel_pmu_disable_all(void) | 818 | static void intel_pmu_disable_all(void) |
730 | { | 819 | { |
731 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); | 820 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); |
@@ -767,6 +856,23 @@ void hw_perf_disable(void) | |||
767 | return x86_pmu.disable_all(); | 856 | return x86_pmu.disable_all(); |
768 | } | 857 | } |
769 | 858 | ||
859 | static void p6_pmu_enable_all(void) | ||
860 | { | ||
861 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
862 | unsigned long val; | ||
863 | |||
864 | if (cpuc->enabled) | ||
865 | return; | ||
866 | |||
867 | cpuc->enabled = 1; | ||
868 | barrier(); | ||
869 | |||
870 | /* p6 only has one enable register */ | ||
871 | rdmsrl(MSR_P6_EVNTSEL0, val); | ||
872 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
873 | wrmsrl(MSR_P6_EVNTSEL0, val); | ||
874 | } | ||
875 | |||
770 | static void intel_pmu_enable_all(void) | 876 | static void intel_pmu_enable_all(void) |
771 | { | 877 | { |
772 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); | 878 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); |
@@ -784,13 +890,13 @@ static void amd_pmu_enable_all(void) | |||
784 | barrier(); | 890 | barrier(); |
785 | 891 | ||
786 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 892 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
893 | struct perf_counter *counter = cpuc->counters[idx]; | ||
787 | u64 val; | 894 | u64 val; |
788 | 895 | ||
789 | if (!test_bit(idx, cpuc->active_mask)) | 896 | if (!test_bit(idx, cpuc->active_mask)) |
790 | continue; | 897 | continue; |
791 | rdmsrl(MSR_K7_EVNTSEL0 + idx, val); | 898 | |
792 | if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) | 899 | val = counter->hw.config; |
793 | continue; | ||
794 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | 900 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; |
795 | wrmsrl(MSR_K7_EVNTSEL0 + idx, val); | 901 | wrmsrl(MSR_K7_EVNTSEL0 + idx, val); |
796 | } | 902 | } |
@@ -819,16 +925,13 @@ static inline void intel_pmu_ack_status(u64 ack) | |||
819 | 925 | ||
820 | static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | 926 | static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) |
821 | { | 927 | { |
822 | int err; | 928 | (void)checking_wrmsrl(hwc->config_base + idx, |
823 | err = checking_wrmsrl(hwc->config_base + idx, | ||
824 | hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); | 929 | hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); |
825 | } | 930 | } |
826 | 931 | ||
827 | static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) | 932 | static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) |
828 | { | 933 | { |
829 | int err; | 934 | (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); |
830 | err = checking_wrmsrl(hwc->config_base + idx, | ||
831 | hwc->config); | ||
832 | } | 935 | } |
833 | 936 | ||
834 | static inline void | 937 | static inline void |
@@ -836,13 +939,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) | |||
836 | { | 939 | { |
837 | int idx = __idx - X86_PMC_IDX_FIXED; | 940 | int idx = __idx - X86_PMC_IDX_FIXED; |
838 | u64 ctrl_val, mask; | 941 | u64 ctrl_val, mask; |
839 | int err; | ||
840 | 942 | ||
841 | mask = 0xfULL << (idx * 4); | 943 | mask = 0xfULL << (idx * 4); |
842 | 944 | ||
843 | rdmsrl(hwc->config_base, ctrl_val); | 945 | rdmsrl(hwc->config_base, ctrl_val); |
844 | ctrl_val &= ~mask; | 946 | ctrl_val &= ~mask; |
845 | err = checking_wrmsrl(hwc->config_base, ctrl_val); | 947 | (void)checking_wrmsrl(hwc->config_base, ctrl_val); |
948 | } | ||
949 | |||
950 | static inline void | ||
951 | p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) | ||
952 | { | ||
953 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
954 | u64 val = P6_NOP_COUNTER; | ||
955 | |||
956 | if (cpuc->enabled) | ||
957 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
958 | |||
959 | (void)checking_wrmsrl(hwc->config_base + idx, val); | ||
846 | } | 960 | } |
847 | 961 | ||
848 | static inline void | 962 | static inline void |
@@ -943,6 +1057,19 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) | |||
943 | err = checking_wrmsrl(hwc->config_base, ctrl_val); | 1057 | err = checking_wrmsrl(hwc->config_base, ctrl_val); |
944 | } | 1058 | } |
945 | 1059 | ||
1060 | static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | ||
1061 | { | ||
1062 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | ||
1063 | u64 val; | ||
1064 | |||
1065 | val = hwc->config; | ||
1066 | if (cpuc->enabled) | ||
1067 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
1068 | |||
1069 | (void)checking_wrmsrl(hwc->config_base + idx, val); | ||
1070 | } | ||
1071 | |||
1072 | |||
946 | static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | 1073 | static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) |
947 | { | 1074 | { |
948 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { | 1075 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { |
@@ -959,8 +1086,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | |||
959 | 1086 | ||
960 | if (cpuc->enabled) | 1087 | if (cpuc->enabled) |
961 | x86_pmu_enable_counter(hwc, idx); | 1088 | x86_pmu_enable_counter(hwc, idx); |
962 | else | ||
963 | x86_pmu_disable_counter(hwc, idx); | ||
964 | } | 1089 | } |
965 | 1090 | ||
966 | static int | 1091 | static int |
@@ -1176,6 +1301,49 @@ static void intel_pmu_reset(void) | |||
1176 | local_irq_restore(flags); | 1301 | local_irq_restore(flags); |
1177 | } | 1302 | } |
1178 | 1303 | ||
1304 | static int p6_pmu_handle_irq(struct pt_regs *regs) | ||
1305 | { | ||
1306 | struct perf_sample_data data; | ||
1307 | struct cpu_hw_counters *cpuc; | ||
1308 | struct perf_counter *counter; | ||
1309 | struct hw_perf_counter *hwc; | ||
1310 | int idx, handled = 0; | ||
1311 | u64 val; | ||
1312 | |||
1313 | data.regs = regs; | ||
1314 | data.addr = 0; | ||
1315 | |||
1316 | cpuc = &__get_cpu_var(cpu_hw_counters); | ||
1317 | |||
1318 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | ||
1319 | if (!test_bit(idx, cpuc->active_mask)) | ||
1320 | continue; | ||
1321 | |||
1322 | counter = cpuc->counters[idx]; | ||
1323 | hwc = &counter->hw; | ||
1324 | |||
1325 | val = x86_perf_counter_update(counter, hwc, idx); | ||
1326 | if (val & (1ULL << (x86_pmu.counter_bits - 1))) | ||
1327 | continue; | ||
1328 | |||
1329 | /* | ||
1330 | * counter overflow | ||
1331 | */ | ||
1332 | handled = 1; | ||
1333 | data.period = counter->hw.last_period; | ||
1334 | |||
1335 | if (!x86_perf_counter_set_period(counter, hwc, idx)) | ||
1336 | continue; | ||
1337 | |||
1338 | if (perf_counter_overflow(counter, 1, &data)) | ||
1339 | p6_pmu_disable_counter(hwc, idx); | ||
1340 | } | ||
1341 | |||
1342 | if (handled) | ||
1343 | inc_irq_stat(apic_perf_irqs); | ||
1344 | |||
1345 | return handled; | ||
1346 | } | ||
1179 | 1347 | ||
1180 | /* | 1348 | /* |
1181 | * This handler is triggered by the local APIC, so the APIC IRQ handling | 1349 | * This handler is triggered by the local APIC, so the APIC IRQ handling |
@@ -1185,14 +1353,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) | |||
1185 | { | 1353 | { |
1186 | struct perf_sample_data data; | 1354 | struct perf_sample_data data; |
1187 | struct cpu_hw_counters *cpuc; | 1355 | struct cpu_hw_counters *cpuc; |
1188 | int bit, cpu, loops; | 1356 | int bit, loops; |
1189 | u64 ack, status; | 1357 | u64 ack, status; |
1190 | 1358 | ||
1191 | data.regs = regs; | 1359 | data.regs = regs; |
1192 | data.addr = 0; | 1360 | data.addr = 0; |
1193 | 1361 | ||
1194 | cpu = smp_processor_id(); | 1362 | cpuc = &__get_cpu_var(cpu_hw_counters); |
1195 | cpuc = &per_cpu(cpu_hw_counters, cpu); | ||
1196 | 1363 | ||
1197 | perf_disable(); | 1364 | perf_disable(); |
1198 | status = intel_pmu_get_status(); | 1365 | status = intel_pmu_get_status(); |
@@ -1249,14 +1416,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) | |||
1249 | struct cpu_hw_counters *cpuc; | 1416 | struct cpu_hw_counters *cpuc; |
1250 | struct perf_counter *counter; | 1417 | struct perf_counter *counter; |
1251 | struct hw_perf_counter *hwc; | 1418 | struct hw_perf_counter *hwc; |
1252 | int cpu, idx, handled = 0; | 1419 | int idx, handled = 0; |
1253 | u64 val; | 1420 | u64 val; |
1254 | 1421 | ||
1255 | data.regs = regs; | 1422 | data.regs = regs; |
1256 | data.addr = 0; | 1423 | data.addr = 0; |
1257 | 1424 | ||
1258 | cpu = smp_processor_id(); | 1425 | cpuc = &__get_cpu_var(cpu_hw_counters); |
1259 | cpuc = &per_cpu(cpu_hw_counters, cpu); | ||
1260 | 1426 | ||
1261 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 1427 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { |
1262 | if (!test_bit(idx, cpuc->active_mask)) | 1428 | if (!test_bit(idx, cpuc->active_mask)) |
@@ -1299,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) | |||
1299 | 1465 | ||
1300 | void set_perf_counter_pending(void) | 1466 | void set_perf_counter_pending(void) |
1301 | { | 1467 | { |
1468 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1302 | apic->send_IPI_self(LOCAL_PENDING_VECTOR); | 1469 | apic->send_IPI_self(LOCAL_PENDING_VECTOR); |
1470 | #endif | ||
1303 | } | 1471 | } |
1304 | 1472 | ||
1305 | void perf_counters_lapic_init(void) | 1473 | void perf_counters_lapic_init(void) |
1306 | { | 1474 | { |
1307 | if (!x86_pmu_initialized()) | 1475 | #ifdef CONFIG_X86_LOCAL_APIC |
1476 | if (!x86_pmu.apic || !x86_pmu_initialized()) | ||
1308 | return; | 1477 | return; |
1309 | 1478 | ||
1310 | /* | 1479 | /* |
1311 | * Always use NMI for PMU | 1480 | * Always use NMI for PMU |
1312 | */ | 1481 | */ |
1313 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 1482 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
1483 | #endif | ||
1314 | } | 1484 | } |
1315 | 1485 | ||
1316 | static int __kprobes | 1486 | static int __kprobes |
@@ -1334,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self, | |||
1334 | 1504 | ||
1335 | regs = args->regs; | 1505 | regs = args->regs; |
1336 | 1506 | ||
1507 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1337 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 1508 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
1509 | #endif | ||
1338 | /* | 1510 | /* |
1339 | * Can't rely on the handled return value to say it was our NMI, two | 1511 | * Can't rely on the handled return value to say it was our NMI, two |
1340 | * counters could trigger 'simultaneously' raising two back-to-back NMIs. | 1512 | * counters could trigger 'simultaneously' raising two back-to-back NMIs. |
@@ -1353,6 +1525,33 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { | |||
1353 | .priority = 1 | 1525 | .priority = 1 |
1354 | }; | 1526 | }; |
1355 | 1527 | ||
1528 | static struct x86_pmu p6_pmu = { | ||
1529 | .name = "p6", | ||
1530 | .handle_irq = p6_pmu_handle_irq, | ||
1531 | .disable_all = p6_pmu_disable_all, | ||
1532 | .enable_all = p6_pmu_enable_all, | ||
1533 | .enable = p6_pmu_enable_counter, | ||
1534 | .disable = p6_pmu_disable_counter, | ||
1535 | .eventsel = MSR_P6_EVNTSEL0, | ||
1536 | .perfctr = MSR_P6_PERFCTR0, | ||
1537 | .event_map = p6_pmu_event_map, | ||
1538 | .raw_event = p6_pmu_raw_event, | ||
1539 | .max_events = ARRAY_SIZE(p6_perfmon_event_map), | ||
1540 | .apic = 1, | ||
1541 | .max_period = (1ULL << 31) - 1, | ||
1542 | .version = 0, | ||
1543 | .num_counters = 2, | ||
1544 | /* | ||
1545 | * Counters have 40 bits implemented. However they are designed such | ||
1546 | * that bits [32-39] are sign extensions of bit 31. As such the | ||
1547 | * effective width of a counter for P6-like PMU is 32 bits only. | ||
1548 | * | ||
1549 | * See IA-32 Intel Architecture Software developer manual Vol 3B | ||
1550 | */ | ||
1551 | .counter_bits = 32, | ||
1552 | .counter_mask = (1ULL << 32) - 1, | ||
1553 | }; | ||
1554 | |||
1356 | static struct x86_pmu intel_pmu = { | 1555 | static struct x86_pmu intel_pmu = { |
1357 | .name = "Intel", | 1556 | .name = "Intel", |
1358 | .handle_irq = intel_pmu_handle_irq, | 1557 | .handle_irq = intel_pmu_handle_irq, |
@@ -1365,6 +1564,7 @@ static struct x86_pmu intel_pmu = { | |||
1365 | .event_map = intel_pmu_event_map, | 1564 | .event_map = intel_pmu_event_map, |
1366 | .raw_event = intel_pmu_raw_event, | 1565 | .raw_event = intel_pmu_raw_event, |
1367 | .max_events = ARRAY_SIZE(intel_perfmon_event_map), | 1566 | .max_events = ARRAY_SIZE(intel_perfmon_event_map), |
1567 | .apic = 1, | ||
1368 | /* | 1568 | /* |
1369 | * Intel PMCs cannot be accessed sanely above 32 bit width, | 1569 | * Intel PMCs cannot be accessed sanely above 32 bit width, |
1370 | * so we install an artificial 1<<31 period regardless of | 1570 | * so we install an artificial 1<<31 period regardless of |
@@ -1388,10 +1588,43 @@ static struct x86_pmu amd_pmu = { | |||
1388 | .num_counters = 4, | 1588 | .num_counters = 4, |
1389 | .counter_bits = 48, | 1589 | .counter_bits = 48, |
1390 | .counter_mask = (1ULL << 48) - 1, | 1590 | .counter_mask = (1ULL << 48) - 1, |
1591 | .apic = 1, | ||
1391 | /* use highest bit to detect overflow */ | 1592 | /* use highest bit to detect overflow */ |
1392 | .max_period = (1ULL << 47) - 1, | 1593 | .max_period = (1ULL << 47) - 1, |
1393 | }; | 1594 | }; |
1394 | 1595 | ||
1596 | static int p6_pmu_init(void) | ||
1597 | { | ||
1598 | switch (boot_cpu_data.x86_model) { | ||
1599 | case 1: | ||
1600 | case 3: /* Pentium Pro */ | ||
1601 | case 5: | ||
1602 | case 6: /* Pentium II */ | ||
1603 | case 7: | ||
1604 | case 8: | ||
1605 | case 11: /* Pentium III */ | ||
1606 | break; | ||
1607 | case 9: | ||
1608 | case 13: | ||
1609 | /* Pentium M */ | ||
1610 | break; | ||
1611 | default: | ||
1612 | pr_cont("unsupported p6 CPU model %d ", | ||
1613 | boot_cpu_data.x86_model); | ||
1614 | return -ENODEV; | ||
1615 | } | ||
1616 | |||
1617 | x86_pmu = p6_pmu; | ||
1618 | |||
1619 | if (!cpu_has_apic) { | ||
1620 | pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); | ||
1621 | pr_info("no hardware sampling interrupt available.\n"); | ||
1622 | x86_pmu.apic = 0; | ||
1623 | } | ||
1624 | |||
1625 | return 0; | ||
1626 | } | ||
1627 | |||
1395 | static int intel_pmu_init(void) | 1628 | static int intel_pmu_init(void) |
1396 | { | 1629 | { |
1397 | union cpuid10_edx edx; | 1630 | union cpuid10_edx edx; |
@@ -1400,8 +1633,14 @@ static int intel_pmu_init(void) | |||
1400 | unsigned int ebx; | 1633 | unsigned int ebx; |
1401 | int version; | 1634 | int version; |
1402 | 1635 | ||
1403 | if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) | 1636 | if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { |
1637 | /* check for P6 processor family */ | ||
1638 | if (boot_cpu_data.x86 == 6) { | ||
1639 | return p6_pmu_init(); | ||
1640 | } else { | ||
1404 | return -ENODEV; | 1641 | return -ENODEV; |
1642 | } | ||
1643 | } | ||
1405 | 1644 | ||
1406 | /* | 1645 | /* |
1407 | * Check whether the Architectural PerfMon supports | 1646 | * Check whether the Architectural PerfMon supports |
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 96f7ac0bbf01..fe26ba3e3451 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c | |||
@@ -354,7 +354,7 @@ void __init efi_init(void) | |||
354 | */ | 354 | */ |
355 | c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); | 355 | c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); |
356 | if (c16) { | 356 | if (c16) { |
357 | for (i = 0; i < sizeof(vendor) && *c16; ++i) | 357 | for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) |
358 | vendor[i] = *c16++; | 358 | vendor[i] = *c16++; |
359 | vendor[i] = '\0'; | 359 | vendor[i] = '\0'; |
360 | } else | 360 | } else |
@@ -512,7 +512,7 @@ void __init efi_enter_virtual_mode(void) | |||
512 | && end_pfn <= max_pfn_mapped)) | 512 | && end_pfn <= max_pfn_mapped)) |
513 | va = __va(md->phys_addr); | 513 | va = __va(md->phys_addr); |
514 | else | 514 | else |
515 | va = efi_ioremap(md->phys_addr, size); | 515 | va = efi_ioremap(md->phys_addr, size, md->type); |
516 | 516 | ||
517 | md->virt_addr = (u64) (unsigned long) va; | 517 | md->virt_addr = (u64) (unsigned long) va; |
518 | 518 | ||
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 22c3b7828c50..ac0621a7ac3d 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c | |||
@@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void) | |||
98 | early_runtime_code_mapping_set_exec(0); | 98 | early_runtime_code_mapping_set_exec(0); |
99 | } | 99 | } |
100 | 100 | ||
101 | void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) | 101 | void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, |
102 | u32 type) | ||
102 | { | 103 | { |
103 | unsigned long last_map_pfn; | 104 | unsigned long last_map_pfn; |
104 | 105 | ||
106 | if (type == EFI_MEMORY_MAPPED_IO) | ||
107 | return ioremap(phys_addr, size); | ||
108 | |||
105 | last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); | 109 | last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); |
106 | if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) | 110 | if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) |
107 | return NULL; | 111 | return NULL; |
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 8663afb56535..cc827ac9e8d3 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -261,9 +261,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); | |||
261 | * which will be freed later | 261 | * which will be freed later |
262 | */ | 262 | */ |
263 | 263 | ||
264 | #ifndef CONFIG_HOTPLUG_CPU | 264 | __CPUINIT |
265 | .section .init.text,"ax",@progbits | ||
266 | #endif | ||
267 | 265 | ||
268 | #ifdef CONFIG_SMP | 266 | #ifdef CONFIG_SMP |
269 | ENTRY(startup_32_smp) | 267 | ENTRY(startup_32_smp) |
@@ -602,7 +600,7 @@ ignore_int: | |||
602 | #endif | 600 | #endif |
603 | iret | 601 | iret |
604 | 602 | ||
605 | .section .cpuinit.data,"wa" | 603 | __REFDATA |
606 | .align 4 | 604 | .align 4 |
607 | ENTRY(initial_code) | 605 | ENTRY(initial_code) |
608 | .long i386_start_kernel | 606 | .long i386_start_kernel |
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 696f0e475c2d..92b7703d3d58 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -187,7 +187,7 @@ static void __init apic_intr_init(void) | |||
187 | #ifdef CONFIG_X86_THERMAL_VECTOR | 187 | #ifdef CONFIG_X86_THERMAL_VECTOR |
188 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | 188 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); |
189 | #endif | 189 | #endif |
190 | #ifdef CONFIG_X86_THRESHOLD | 190 | #ifdef CONFIG_X86_MCE_THRESHOLD |
191 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); | 191 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); |
192 | #endif | 192 | #endif |
193 | #if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) | 193 | #if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) |
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 846510b78a09..2a62d843f015 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c | |||
@@ -347,7 +347,7 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id) | |||
347 | 347 | ||
348 | static struct irqaction mfgptirq = { | 348 | static struct irqaction mfgptirq = { |
349 | .handler = mfgpt_tick, | 349 | .handler = mfgpt_tick, |
350 | .flags = IRQF_DISABLED | IRQF_NOBALANCING, | 350 | .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, |
351 | .name = "mfgpt-timer" | 351 | .name = "mfgpt-timer" |
352 | }; | 352 | }; |
353 | 353 | ||
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 994dd6a4a2a0..071166a4ba83 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -519,16 +519,12 @@ static void c1e_idle(void) | |||
519 | if (!cpumask_test_cpu(cpu, c1e_mask)) { | 519 | if (!cpumask_test_cpu(cpu, c1e_mask)) { |
520 | cpumask_set_cpu(cpu, c1e_mask); | 520 | cpumask_set_cpu(cpu, c1e_mask); |
521 | /* | 521 | /* |
522 | * Force broadcast so ACPI can not interfere. Needs | 522 | * Force broadcast so ACPI can not interfere. |
523 | * to run with interrupts enabled as it uses | ||
524 | * smp_function_call. | ||
525 | */ | 523 | */ |
526 | local_irq_enable(); | ||
527 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, | 524 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, |
528 | &cpu); | 525 | &cpu); |
529 | printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", | 526 | printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", |
530 | cpu); | 527 | cpu); |
531 | local_irq_disable(); | ||
532 | } | 528 | } |
533 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); | 529 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); |
534 | 530 | ||
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 18ce5c04242a..27349f92a6d7 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -3,6 +3,7 @@ | |||
3 | #include <linux/init.h> | 3 | #include <linux/init.h> |
4 | #include <linux/pm.h> | 4 | #include <linux/pm.h> |
5 | #include <linux/efi.h> | 5 | #include <linux/efi.h> |
6 | #include <linux/dmi.h> | ||
6 | #include <linux/tboot.h> | 7 | #include <linux/tboot.h> |
7 | #include <acpi/reboot.h> | 8 | #include <acpi/reboot.h> |
8 | #include <asm/io.h> | 9 | #include <asm/io.h> |
@@ -18,7 +19,6 @@ | |||
18 | #include <asm/cpu.h> | 19 | #include <asm/cpu.h> |
19 | 20 | ||
20 | #ifdef CONFIG_X86_32 | 21 | #ifdef CONFIG_X86_32 |
21 | # include <linux/dmi.h> | ||
22 | # include <linux/ctype.h> | 22 | # include <linux/ctype.h> |
23 | # include <linux/mc146818rtc.h> | 23 | # include <linux/mc146818rtc.h> |
24 | #else | 24 | #else |
@@ -250,6 +250,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
250 | DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), | 250 | DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), |
251 | }, | 251 | }, |
252 | }, | 252 | }, |
253 | { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */ | ||
254 | .callback = set_bios_reboot, | ||
255 | .ident = "CompuLab SBC-FITPC2", | ||
256 | .matches = { | ||
257 | DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"), | ||
258 | DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), | ||
259 | }, | ||
260 | }, | ||
253 | { } | 261 | { } |
254 | }; | 262 | }; |
255 | 263 | ||
@@ -397,6 +405,46 @@ EXPORT_SYMBOL(machine_real_restart); | |||
397 | 405 | ||
398 | #endif /* CONFIG_X86_32 */ | 406 | #endif /* CONFIG_X86_32 */ |
399 | 407 | ||
408 | /* | ||
409 | * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot | ||
410 | */ | ||
411 | static int __init set_pci_reboot(const struct dmi_system_id *d) | ||
412 | { | ||
413 | if (reboot_type != BOOT_CF9) { | ||
414 | reboot_type = BOOT_CF9; | ||
415 | printk(KERN_INFO "%s series board detected. " | ||
416 | "Selecting PCI-method for reboots.\n", d->ident); | ||
417 | } | ||
418 | return 0; | ||
419 | } | ||
420 | |||
421 | static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { | ||
422 | { /* Handle problems with rebooting on Apple MacBook5 */ | ||
423 | .callback = set_pci_reboot, | ||
424 | .ident = "Apple MacBook5", | ||
425 | .matches = { | ||
426 | DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), | ||
427 | DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), | ||
428 | }, | ||
429 | }, | ||
430 | { /* Handle problems with rebooting on Apple MacBookPro5 */ | ||
431 | .callback = set_pci_reboot, | ||
432 | .ident = "Apple MacBookPro5", | ||
433 | .matches = { | ||
434 | DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), | ||
435 | DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), | ||
436 | }, | ||
437 | }, | ||
438 | { } | ||
439 | }; | ||
440 | |||
441 | static int __init pci_reboot_init(void) | ||
442 | { | ||
443 | dmi_check_system(pci_reboot_dmi_table); | ||
444 | return 0; | ||
445 | } | ||
446 | core_initcall(pci_reboot_init); | ||
447 | |||
400 | static inline void kb_wait(void) | 448 | static inline void kb_wait(void) |
401 | { | 449 | { |
402 | int i; | 450 | int i; |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6ce0d6f38f7f..61f86f241420 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -673,6 +673,19 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { | |||
673 | DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), | 673 | DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), |
674 | }, | 674 | }, |
675 | }, | 675 | }, |
676 | { | ||
677 | /* | ||
678 | * AMI BIOS with low memory corruption was found on Intel DG45ID board. | ||
679 | * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will | ||
680 | * match only DMI_BOARD_NAME and see if there is more bad products | ||
681 | * with this vendor. | ||
682 | */ | ||
683 | .callback = dmi_low_memory_corruption, | ||
684 | .ident = "AMI BIOS", | ||
685 | .matches = { | ||
686 | DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), | ||
687 | }, | ||
688 | }, | ||
676 | #endif | 689 | #endif |
677 | {} | 690 | {} |
678 | }; | 691 | }; |
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 29a3eef7cf4a..07d81916f212 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -165,7 +165,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) | |||
165 | 165 | ||
166 | if (!chosen) { | 166 | if (!chosen) { |
167 | size_t vm_size = VMALLOC_END - VMALLOC_START; | 167 | size_t vm_size = VMALLOC_END - VMALLOC_START; |
168 | size_t tot_size = num_possible_cpus() * PMD_SIZE; | 168 | size_t tot_size = nr_cpu_ids * PMD_SIZE; |
169 | 169 | ||
170 | /* on non-NUMA, embedding is better */ | 170 | /* on non-NUMA, embedding is better */ |
171 | if (!pcpu_need_numa()) | 171 | if (!pcpu_need_numa()) |
@@ -199,7 +199,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) | |||
199 | dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; | 199 | dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; |
200 | 200 | ||
201 | /* allocate pointer array and alloc large pages */ | 201 | /* allocate pointer array and alloc large pages */ |
202 | map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); | 202 | map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0])); |
203 | pcpul_map = alloc_bootmem(map_size); | 203 | pcpul_map = alloc_bootmem(map_size); |
204 | 204 | ||
205 | for_each_possible_cpu(cpu) { | 205 | for_each_possible_cpu(cpu) { |
@@ -228,7 +228,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) | |||
228 | 228 | ||
229 | /* allocate address and map */ | 229 | /* allocate address and map */ |
230 | pcpul_vm.flags = VM_ALLOC; | 230 | pcpul_vm.flags = VM_ALLOC; |
231 | pcpul_vm.size = num_possible_cpus() * PMD_SIZE; | 231 | pcpul_vm.size = nr_cpu_ids * PMD_SIZE; |
232 | vm_area_register_early(&pcpul_vm, PMD_SIZE); | 232 | vm_area_register_early(&pcpul_vm, PMD_SIZE); |
233 | 233 | ||
234 | for_each_possible_cpu(cpu) { | 234 | for_each_possible_cpu(cpu) { |
@@ -250,8 +250,8 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) | |||
250 | PMD_SIZE, pcpul_vm.addr, NULL); | 250 | PMD_SIZE, pcpul_vm.addr, NULL); |
251 | 251 | ||
252 | /* sort pcpul_map array for pcpu_lpage_remapped() */ | 252 | /* sort pcpul_map array for pcpu_lpage_remapped() */ |
253 | for (i = 0; i < num_possible_cpus() - 1; i++) | 253 | for (i = 0; i < nr_cpu_ids - 1; i++) |
254 | for (j = i + 1; j < num_possible_cpus(); j++) | 254 | for (j = i + 1; j < nr_cpu_ids; j++) |
255 | if (pcpul_map[i].ptr > pcpul_map[j].ptr) { | 255 | if (pcpul_map[i].ptr > pcpul_map[j].ptr) { |
256 | struct pcpul_ent tmp = pcpul_map[i]; | 256 | struct pcpul_ent tmp = pcpul_map[i]; |
257 | pcpul_map[i] = pcpul_map[j]; | 257 | pcpul_map[i] = pcpul_map[j]; |
@@ -288,7 +288,7 @@ void *pcpu_lpage_remapped(void *kaddr) | |||
288 | { | 288 | { |
289 | void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); | 289 | void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); |
290 | unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; | 290 | unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; |
291 | int left = 0, right = num_possible_cpus() - 1; | 291 | int left = 0, right = nr_cpu_ids - 1; |
292 | int pos; | 292 | int pos; |
293 | 293 | ||
294 | /* pcpul in use at all? */ | 294 | /* pcpul in use at all? */ |
@@ -377,7 +377,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) | |||
377 | pcpu4k_nr_static_pages = PFN_UP(static_size); | 377 | pcpu4k_nr_static_pages = PFN_UP(static_size); |
378 | 378 | ||
379 | /* unaligned allocations can't be freed, round up to page size */ | 379 | /* unaligned allocations can't be freed, round up to page size */ |
380 | pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() | 380 | pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids |
381 | * sizeof(pcpu4k_pages[0])); | 381 | * sizeof(pcpu4k_pages[0])); |
382 | pcpu4k_pages = alloc_bootmem(pages_size); | 382 | pcpu4k_pages = alloc_bootmem(pages_size); |
383 | 383 | ||
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 8ccabb8a2f6a..77b9689f8edb 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c | |||
@@ -744,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode) | |||
744 | * note that base_dest_nodeid is actually a nasid. | 744 | * note that base_dest_nodeid is actually a nasid. |
745 | */ | 745 | */ |
746 | ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; | 746 | ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; |
747 | ad2->header.dest_subnodeid = 0x10; /* the LB */ | ||
747 | ad2->header.command = UV_NET_ENDPOINT_INTD; | 748 | ad2->header.command = UV_NET_ENDPOINT_INTD; |
748 | ad2->header.int_both = 1; | 749 | ad2->header.int_both = 1; |
749 | /* | 750 | /* |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6e1a368d21d4..71f4368b357e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -275,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) | |||
275 | * use the TSC value at the transitions to calculate a pretty | 275 | * use the TSC value at the transitions to calculate a pretty |
276 | * good value for the TSC frequencty. | 276 | * good value for the TSC frequencty. |
277 | */ | 277 | */ |
278 | static inline int pit_verify_msb(unsigned char val) | ||
279 | { | ||
280 | /* Ignore LSB */ | ||
281 | inb(0x42); | ||
282 | return inb(0x42) == val; | ||
283 | } | ||
284 | |||
278 | static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) | 285 | static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) |
279 | { | 286 | { |
280 | int count; | 287 | int count; |
281 | u64 tsc = 0; | 288 | u64 tsc = 0; |
282 | 289 | ||
283 | for (count = 0; count < 50000; count++) { | 290 | for (count = 0; count < 50000; count++) { |
284 | /* Ignore LSB */ | 291 | if (!pit_verify_msb(val)) |
285 | inb(0x42); | ||
286 | if (inb(0x42) != val) | ||
287 | break; | 292 | break; |
288 | tsc = get_cycles(); | 293 | tsc = get_cycles(); |
289 | } | 294 | } |
@@ -336,8 +341,7 @@ static unsigned long quick_pit_calibrate(void) | |||
336 | * to do that is to just read back the 16-bit counter | 341 | * to do that is to just read back the 16-bit counter |
337 | * once from the PIT. | 342 | * once from the PIT. |
338 | */ | 343 | */ |
339 | inb(0x42); | 344 | pit_verify_msb(0); |
340 | inb(0x42); | ||
341 | 345 | ||
342 | if (pit_expect_msb(0xff, &tsc, &d1)) { | 346 | if (pit_expect_msb(0xff, &tsc, &d1)) { |
343 | for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { | 347 | for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { |
@@ -348,8 +352,19 @@ static unsigned long quick_pit_calibrate(void) | |||
348 | * Iterate until the error is less than 500 ppm | 352 | * Iterate until the error is less than 500 ppm |
349 | */ | 353 | */ |
350 | delta -= tsc; | 354 | delta -= tsc; |
351 | if (d1+d2 < delta >> 11) | 355 | if (d1+d2 >= delta >> 11) |
352 | goto success; | 356 | continue; |
357 | |||
358 | /* | ||
359 | * Check the PIT one more time to verify that | ||
360 | * all TSC reads were stable wrt the PIT. | ||
361 | * | ||
362 | * This also guarantees serialization of the | ||
363 | * last cycle read ('d2') in pit_expect_msb. | ||
364 | */ | ||
365 | if (!pit_verify_msb(0xfe - i)) | ||
366 | break; | ||
367 | goto success; | ||
353 | } | 368 | } |
354 | } | 369 | } |
355 | printk("Fast TSC calibration failed\n"); | 370 | printk("Fast TSC calibration failed\n"); |
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b263423fbe2a..95a7289e4b0c 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c | |||
@@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, | |||
441 | ap.ds = __USER_DS; | 441 | ap.ds = __USER_DS; |
442 | ap.es = __USER_DS; | 442 | ap.es = __USER_DS; |
443 | ap.fs = __KERNEL_PERCPU; | 443 | ap.fs = __KERNEL_PERCPU; |
444 | ap.gs = 0; | 444 | ap.gs = __KERNEL_STACK_CANARY; |
445 | 445 | ||
446 | ap.eflags = 0; | 446 | ap.eflags = 0; |
447 | 447 | ||
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 367e87882041..9fc178255c04 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S | |||
@@ -46,11 +46,10 @@ PHDRS { | |||
46 | data PT_LOAD FLAGS(7); /* RWE */ | 46 | data PT_LOAD FLAGS(7); /* RWE */ |
47 | #ifdef CONFIG_X86_64 | 47 | #ifdef CONFIG_X86_64 |
48 | user PT_LOAD FLAGS(7); /* RWE */ | 48 | user PT_LOAD FLAGS(7); /* RWE */ |
49 | data.init PT_LOAD FLAGS(7); /* RWE */ | ||
50 | #ifdef CONFIG_SMP | 49 | #ifdef CONFIG_SMP |
51 | percpu PT_LOAD FLAGS(7); /* RWE */ | 50 | percpu PT_LOAD FLAGS(7); /* RWE */ |
52 | #endif | 51 | #endif |
53 | data.init2 PT_LOAD FLAGS(7); /* RWE */ | 52 | init PT_LOAD FLAGS(7); /* RWE */ |
54 | #endif | 53 | #endif |
55 | note PT_NOTE FLAGS(0); /* ___ */ | 54 | note PT_NOTE FLAGS(0); /* ___ */ |
56 | } | 55 | } |
@@ -103,72 +102,43 @@ SECTIONS | |||
103 | __stop___ex_table = .; | 102 | __stop___ex_table = .; |
104 | } :text = 0x9090 | 103 | } :text = 0x9090 |
105 | 104 | ||
106 | RODATA | 105 | RO_DATA(PAGE_SIZE) |
107 | 106 | ||
108 | /* Data */ | 107 | /* Data */ |
109 | . = ALIGN(PAGE_SIZE); | ||
110 | .data : AT(ADDR(.data) - LOAD_OFFSET) { | 108 | .data : AT(ADDR(.data) - LOAD_OFFSET) { |
111 | /* Start of data section */ | 109 | /* Start of data section */ |
112 | _sdata = .; | 110 | _sdata = .; |
113 | DATA_DATA | ||
114 | CONSTRUCTORS | ||
115 | 111 | ||
116 | #ifdef CONFIG_X86_64 | 112 | /* init_task */ |
117 | /* End of data section */ | 113 | INIT_TASK_DATA(THREAD_SIZE) |
118 | _edata = .; | ||
119 | #endif | ||
120 | } :data | ||
121 | 114 | ||
122 | #ifdef CONFIG_X86_32 | 115 | #ifdef CONFIG_X86_32 |
123 | /* 32 bit has nosave before _edata */ | 116 | /* 32 bit has nosave before _edata */ |
124 | . = ALIGN(PAGE_SIZE); | 117 | NOSAVE_DATA |
125 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { | ||
126 | __nosave_begin = .; | ||
127 | *(.data.nosave) | ||
128 | . = ALIGN(PAGE_SIZE); | ||
129 | __nosave_end = .; | ||
130 | } | ||
131 | #endif | 118 | #endif |
132 | 119 | ||
133 | . = ALIGN(PAGE_SIZE); | 120 | PAGE_ALIGNED_DATA(PAGE_SIZE) |
134 | .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { | ||
135 | *(.data.page_aligned) | ||
136 | *(.data.idt) | 121 | *(.data.idt) |
137 | } | ||
138 | 122 | ||
139 | #ifdef CONFIG_X86_32 | 123 | CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) |
140 | . = ALIGN(32); | ||
141 | #else | ||
142 | . = ALIGN(PAGE_SIZE); | ||
143 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
144 | #endif | ||
145 | .data.cacheline_aligned : | ||
146 | AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { | ||
147 | *(.data.cacheline_aligned) | ||
148 | } | ||
149 | 124 | ||
150 | /* rarely changed data like cpu maps */ | 125 | DATA_DATA |
151 | #ifdef CONFIG_X86_32 | 126 | CONSTRUCTORS |
152 | . = ALIGN(32); | 127 | |
153 | #else | 128 | /* rarely changed data like cpu maps */ |
154 | . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); | 129 | READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) |
155 | #endif | ||
156 | .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { | ||
157 | *(.data.read_mostly) | ||
158 | 130 | ||
159 | #ifdef CONFIG_X86_32 | ||
160 | /* End of data section */ | 131 | /* End of data section */ |
161 | _edata = .; | 132 | _edata = .; |
162 | #endif | 133 | } :data |
163 | } | ||
164 | 134 | ||
165 | #ifdef CONFIG_X86_64 | 135 | #ifdef CONFIG_X86_64 |
166 | 136 | ||
167 | #define VSYSCALL_ADDR (-10*1024*1024) | 137 | #define VSYSCALL_ADDR (-10*1024*1024) |
168 | #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ | 138 | #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \ |
169 | SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | 139 | PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) |
170 | #define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ | 140 | #define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \ |
171 | SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | 141 | PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) |
172 | 142 | ||
173 | #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) | 143 | #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) |
174 | #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) | 144 | #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) |
@@ -234,35 +204,29 @@ SECTIONS | |||
234 | 204 | ||
235 | #endif /* CONFIG_X86_64 */ | 205 | #endif /* CONFIG_X86_64 */ |
236 | 206 | ||
237 | /* init_task */ | 207 | /* Init code and data - will be freed after init */ |
238 | . = ALIGN(THREAD_SIZE); | 208 | . = ALIGN(PAGE_SIZE); |
239 | .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { | 209 | .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { |
240 | *(.data.init_task) | 210 | __init_begin = .; /* paired with __init_end */ |
241 | } | 211 | } |
242 | #ifdef CONFIG_X86_64 | ||
243 | :data.init | ||
244 | #endif | ||
245 | 212 | ||
213 | #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) | ||
246 | /* | 214 | /* |
247 | * smp_locks might be freed after init | 215 | * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the |
248 | * start/end must be page aligned | 216 | * output PHDR, so the next output section - .init.text - should |
217 | * start another segment - init. | ||
249 | */ | 218 | */ |
250 | . = ALIGN(PAGE_SIZE); | 219 | PERCPU_VADDR(0, :percpu) |
251 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | 220 | #endif |
252 | __smp_locks = .; | ||
253 | *(.smp_locks) | ||
254 | __smp_locks_end = .; | ||
255 | . = ALIGN(PAGE_SIZE); | ||
256 | } | ||
257 | 221 | ||
258 | /* Init code and data - will be freed after init */ | ||
259 | . = ALIGN(PAGE_SIZE); | ||
260 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { | 222 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { |
261 | __init_begin = .; /* paired with __init_end */ | ||
262 | _sinittext = .; | 223 | _sinittext = .; |
263 | INIT_TEXT | 224 | INIT_TEXT |
264 | _einittext = .; | 225 | _einittext = .; |
265 | } | 226 | } |
227 | #ifdef CONFIG_X86_64 | ||
228 | :init | ||
229 | #endif | ||
266 | 230 | ||
267 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { | 231 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { |
268 | INIT_DATA | 232 | INIT_DATA |
@@ -333,17 +297,7 @@ SECTIONS | |||
333 | } | 297 | } |
334 | #endif | 298 | #endif |
335 | 299 | ||
336 | #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) | 300 | #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) |
337 | /* | ||
338 | * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the | ||
339 | * output PHDR, so the next output section - __data_nosave - should | ||
340 | * start another section data.init2. Also, pda should be at the head of | ||
341 | * percpu area. Preallocate it and define the percpu offset symbol | ||
342 | * so that it can be accessed as a percpu variable. | ||
343 | */ | ||
344 | . = ALIGN(PAGE_SIZE); | ||
345 | PERCPU_VADDR(0, :percpu) | ||
346 | #else | ||
347 | PERCPU(PAGE_SIZE) | 301 | PERCPU(PAGE_SIZE) |
348 | #endif | 302 | #endif |
349 | 303 | ||
@@ -354,15 +308,22 @@ SECTIONS | |||
354 | __init_end = .; | 308 | __init_end = .; |
355 | } | 309 | } |
356 | 310 | ||
311 | /* | ||
312 | * smp_locks might be freed after init | ||
313 | * start/end must be page aligned | ||
314 | */ | ||
315 | . = ALIGN(PAGE_SIZE); | ||
316 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | ||
317 | __smp_locks = .; | ||
318 | *(.smp_locks) | ||
319 | __smp_locks_end = .; | ||
320 | . = ALIGN(PAGE_SIZE); | ||
321 | } | ||
322 | |||
357 | #ifdef CONFIG_X86_64 | 323 | #ifdef CONFIG_X86_64 |
358 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { | 324 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { |
359 | . = ALIGN(PAGE_SIZE); | 325 | NOSAVE_DATA |
360 | __nosave_begin = .; | 326 | } |
361 | *(.data.nosave) | ||
362 | . = ALIGN(PAGE_SIZE); | ||
363 | __nosave_end = .; | ||
364 | } :data.init2 | ||
365 | /* use another section data.init2, see PERCPU_VADDR() above */ | ||
366 | #endif | 327 | #endif |
367 | 328 | ||
368 | /* BSS */ | 329 | /* BSS */ |
@@ -400,8 +361,8 @@ SECTIONS | |||
400 | 361 | ||
401 | 362 | ||
402 | #ifdef CONFIG_X86_32 | 363 | #ifdef CONFIG_X86_32 |
403 | ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), | 364 | . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), |
404 | "kernel image bigger than KERNEL_IMAGE_SIZE") | 365 | "kernel image bigger than KERNEL_IMAGE_SIZE"); |
405 | #else | 366 | #else |
406 | /* | 367 | /* |
407 | * Per-cpu symbols which need to be offset from __per_cpu_load | 368 | * Per-cpu symbols which need to be offset from __per_cpu_load |
@@ -414,12 +375,12 @@ INIT_PER_CPU(irq_stack_union); | |||
414 | /* | 375 | /* |
415 | * Build-time check on the image size: | 376 | * Build-time check on the image size: |
416 | */ | 377 | */ |
417 | ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), | 378 | . = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), |
418 | "kernel image bigger than KERNEL_IMAGE_SIZE") | 379 | "kernel image bigger than KERNEL_IMAGE_SIZE"); |
419 | 380 | ||
420 | #ifdef CONFIG_SMP | 381 | #ifdef CONFIG_SMP |
421 | ASSERT((per_cpu__irq_stack_union == 0), | 382 | . = ASSERT((per_cpu__irq_stack_union == 0), |
422 | "irq_stack_union is not at start of per-cpu area"); | 383 | "irq_stack_union is not at start of per-cpu area"); |
423 | #endif | 384 | #endif |
424 | 385 | ||
425 | #endif /* CONFIG_X86_32 */ | 386 | #endif /* CONFIG_X86_32 */ |
@@ -427,7 +388,7 @@ ASSERT((per_cpu__irq_stack_union == 0), | |||
427 | #ifdef CONFIG_KEXEC | 388 | #ifdef CONFIG_KEXEC |
428 | #include <asm/kexec.h> | 389 | #include <asm/kexec.h> |
429 | 390 | ||
430 | ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, | 391 | . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, |
431 | "kexec control code size is too big") | 392 | "kexec control code size is too big"); |
432 | #endif | 393 | #endif |
433 | 394 | ||
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4d6f0d293ee2..21f68e00524f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -104,6 +104,9 @@ static s64 __kpit_elapsed(struct kvm *kvm) | |||
104 | ktime_t remaining; | 104 | ktime_t remaining; |
105 | struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; | 105 | struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; |
106 | 106 | ||
107 | if (!ps->pit_timer.period) | ||
108 | return 0; | ||
109 | |||
107 | /* | 110 | /* |
108 | * The Counter does not stop when it reaches zero. In | 111 | * The Counter does not stop when it reaches zero. In |
109 | * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to | 112 | * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7030b5f911bf..0ef5bb2b4043 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -489,16 +489,20 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) | |||
489 | * | 489 | * |
490 | * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc | 490 | * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc |
491 | * containing more mappings. | 491 | * containing more mappings. |
492 | * | ||
493 | * Returns the number of rmap entries before the spte was added or zero if | ||
494 | * the spte was not added. | ||
495 | * | ||
492 | */ | 496 | */ |
493 | static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) | 497 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) |
494 | { | 498 | { |
495 | struct kvm_mmu_page *sp; | 499 | struct kvm_mmu_page *sp; |
496 | struct kvm_rmap_desc *desc; | 500 | struct kvm_rmap_desc *desc; |
497 | unsigned long *rmapp; | 501 | unsigned long *rmapp; |
498 | int i; | 502 | int i, count = 0; |
499 | 503 | ||
500 | if (!is_rmap_pte(*spte)) | 504 | if (!is_rmap_pte(*spte)) |
501 | return; | 505 | return count; |
502 | gfn = unalias_gfn(vcpu->kvm, gfn); | 506 | gfn = unalias_gfn(vcpu->kvm, gfn); |
503 | sp = page_header(__pa(spte)); | 507 | sp = page_header(__pa(spte)); |
504 | sp->gfns[spte - sp->spt] = gfn; | 508 | sp->gfns[spte - sp->spt] = gfn; |
@@ -515,8 +519,10 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) | |||
515 | } else { | 519 | } else { |
516 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | 520 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); |
517 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 521 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
518 | while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) | 522 | while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { |
519 | desc = desc->more; | 523 | desc = desc->more; |
524 | count += RMAP_EXT; | ||
525 | } | ||
520 | if (desc->shadow_ptes[RMAP_EXT-1]) { | 526 | if (desc->shadow_ptes[RMAP_EXT-1]) { |
521 | desc->more = mmu_alloc_rmap_desc(vcpu); | 527 | desc->more = mmu_alloc_rmap_desc(vcpu); |
522 | desc = desc->more; | 528 | desc = desc->more; |
@@ -525,6 +531,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) | |||
525 | ; | 531 | ; |
526 | desc->shadow_ptes[i] = spte; | 532 | desc->shadow_ptes[i] = spte; |
527 | } | 533 | } |
534 | return count; | ||
528 | } | 535 | } |
529 | 536 | ||
530 | static void rmap_desc_remove_entry(unsigned long *rmapp, | 537 | static void rmap_desc_remove_entry(unsigned long *rmapp, |
@@ -754,6 +761,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) | |||
754 | return young; | 761 | return young; |
755 | } | 762 | } |
756 | 763 | ||
764 | #define RMAP_RECYCLE_THRESHOLD 1000 | ||
765 | |||
766 | static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) | ||
767 | { | ||
768 | unsigned long *rmapp; | ||
769 | |||
770 | gfn = unalias_gfn(vcpu->kvm, gfn); | ||
771 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); | ||
772 | |||
773 | kvm_unmap_rmapp(vcpu->kvm, rmapp); | ||
774 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
775 | } | ||
776 | |||
757 | int kvm_age_hva(struct kvm *kvm, unsigned long hva) | 777 | int kvm_age_hva(struct kvm *kvm, unsigned long hva) |
758 | { | 778 | { |
759 | return kvm_handle_hva(kvm, hva, kvm_age_rmapp); | 779 | return kvm_handle_hva(kvm, hva, kvm_age_rmapp); |
@@ -1407,24 +1427,25 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1407 | */ | 1427 | */ |
1408 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | 1428 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) |
1409 | { | 1429 | { |
1430 | int used_pages; | ||
1431 | |||
1432 | used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; | ||
1433 | used_pages = max(0, used_pages); | ||
1434 | |||
1410 | /* | 1435 | /* |
1411 | * If we set the number of mmu pages to be smaller be than the | 1436 | * If we set the number of mmu pages to be smaller be than the |
1412 | * number of actived pages , we must to free some mmu pages before we | 1437 | * number of actived pages , we must to free some mmu pages before we |
1413 | * change the value | 1438 | * change the value |
1414 | */ | 1439 | */ |
1415 | 1440 | ||
1416 | if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > | 1441 | if (used_pages > kvm_nr_mmu_pages) { |
1417 | kvm_nr_mmu_pages) { | 1442 | while (used_pages > kvm_nr_mmu_pages) { |
1418 | int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages | ||
1419 | - kvm->arch.n_free_mmu_pages; | ||
1420 | |||
1421 | while (n_used_mmu_pages > kvm_nr_mmu_pages) { | ||
1422 | struct kvm_mmu_page *page; | 1443 | struct kvm_mmu_page *page; |
1423 | 1444 | ||
1424 | page = container_of(kvm->arch.active_mmu_pages.prev, | 1445 | page = container_of(kvm->arch.active_mmu_pages.prev, |
1425 | struct kvm_mmu_page, link); | 1446 | struct kvm_mmu_page, link); |
1426 | kvm_mmu_zap_page(kvm, page); | 1447 | kvm_mmu_zap_page(kvm, page); |
1427 | n_used_mmu_pages--; | 1448 | used_pages--; |
1428 | } | 1449 | } |
1429 | kvm->arch.n_free_mmu_pages = 0; | 1450 | kvm->arch.n_free_mmu_pages = 0; |
1430 | } | 1451 | } |
@@ -1740,6 +1761,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1740 | { | 1761 | { |
1741 | int was_rmapped = 0; | 1762 | int was_rmapped = 0; |
1742 | int was_writeble = is_writeble_pte(*shadow_pte); | 1763 | int was_writeble = is_writeble_pte(*shadow_pte); |
1764 | int rmap_count; | ||
1743 | 1765 | ||
1744 | pgprintk("%s: spte %llx access %x write_fault %d" | 1766 | pgprintk("%s: spte %llx access %x write_fault %d" |
1745 | " user_fault %d gfn %lx\n", | 1767 | " user_fault %d gfn %lx\n", |
@@ -1781,9 +1803,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1781 | 1803 | ||
1782 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); | 1804 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); |
1783 | if (!was_rmapped) { | 1805 | if (!was_rmapped) { |
1784 | rmap_add(vcpu, shadow_pte, gfn, largepage); | 1806 | rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); |
1785 | if (!is_rmap_pte(*shadow_pte)) | 1807 | if (!is_rmap_pte(*shadow_pte)) |
1786 | kvm_release_pfn_clean(pfn); | 1808 | kvm_release_pfn_clean(pfn); |
1809 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) | ||
1810 | rmap_recycle(vcpu, gfn, largepage); | ||
1787 | } else { | 1811 | } else { |
1788 | if (was_writeble) | 1812 | if (was_writeble) |
1789 | kvm_release_pfn_dirty(pfn); | 1813 | kvm_release_pfn_dirty(pfn); |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 71510e07e69e..b1f658ad2f06 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -711,6 +711,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
711 | svm->vmcb->control.tsc_offset += delta; | 711 | svm->vmcb->control.tsc_offset += delta; |
712 | vcpu->cpu = cpu; | 712 | vcpu->cpu = cpu; |
713 | kvm_migrate_timers(vcpu); | 713 | kvm_migrate_timers(vcpu); |
714 | svm->asid_generation = 0; | ||
714 | } | 715 | } |
715 | 716 | ||
716 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | 717 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) |
@@ -1031,7 +1032,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) | |||
1031 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; | 1032 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; |
1032 | } | 1033 | } |
1033 | 1034 | ||
1034 | svm->vcpu.cpu = svm_data->cpu; | ||
1035 | svm->asid_generation = svm_data->asid_generation; | 1035 | svm->asid_generation = svm_data->asid_generation; |
1036 | svm->vmcb->control.asid = svm_data->next_asid++; | 1036 | svm->vmcb->control.asid = svm_data->next_asid++; |
1037 | } | 1037 | } |
@@ -2300,8 +2300,8 @@ static void pre_svm_run(struct vcpu_svm *svm) | |||
2300 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); | 2300 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); |
2301 | 2301 | ||
2302 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; | 2302 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; |
2303 | if (svm->vcpu.cpu != cpu || | 2303 | /* FIXME: handle wraparound of asid_generation */ |
2304 | svm->asid_generation != svm_data->asid_generation) | 2304 | if (svm->asid_generation != svm_data->asid_generation) |
2305 | new_asid(svm, svm_data); | 2305 | new_asid(svm, svm_data); |
2306 | } | 2306 | } |
2307 | 2307 | ||
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 356a0ce85c68..29f912927a58 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -3157,8 +3157,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, | |||
3157 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3157 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3158 | enum emulation_result err = EMULATE_DONE; | 3158 | enum emulation_result err = EMULATE_DONE; |
3159 | 3159 | ||
3160 | preempt_enable(); | ||
3161 | local_irq_enable(); | 3160 | local_irq_enable(); |
3161 | preempt_enable(); | ||
3162 | 3162 | ||
3163 | while (!guest_state_valid(vcpu)) { | 3163 | while (!guest_state_valid(vcpu)) { |
3164 | err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | 3164 | err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); |
@@ -3168,7 +3168,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, | |||
3168 | 3168 | ||
3169 | if (err != EMULATE_DONE) { | 3169 | if (err != EMULATE_DONE) { |
3170 | kvm_report_emulation_failure(vcpu, "emulation failure"); | 3170 | kvm_report_emulation_failure(vcpu, "emulation failure"); |
3171 | return; | 3171 | break; |
3172 | } | 3172 | } |
3173 | 3173 | ||
3174 | if (signal_pending(current)) | 3174 | if (signal_pending(current)) |
@@ -3177,8 +3177,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, | |||
3177 | schedule(); | 3177 | schedule(); |
3178 | } | 3178 | } |
3179 | 3179 | ||
3180 | local_irq_disable(); | ||
3181 | preempt_disable(); | 3180 | preempt_disable(); |
3181 | local_irq_disable(); | ||
3182 | 3182 | ||
3183 | vmx->invalid_state_emulation_result = err; | 3183 | vmx->invalid_state_emulation_result = err; |
3184 | } | 3184 | } |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe5474aec41a..3d4529011828 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -704,11 +704,48 @@ static bool msr_mtrr_valid(unsigned msr) | |||
704 | return false; | 704 | return false; |
705 | } | 705 | } |
706 | 706 | ||
707 | static bool valid_pat_type(unsigned t) | ||
708 | { | ||
709 | return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ | ||
710 | } | ||
711 | |||
712 | static bool valid_mtrr_type(unsigned t) | ||
713 | { | ||
714 | return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ | ||
715 | } | ||
716 | |||
717 | static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) | ||
718 | { | ||
719 | int i; | ||
720 | |||
721 | if (!msr_mtrr_valid(msr)) | ||
722 | return false; | ||
723 | |||
724 | if (msr == MSR_IA32_CR_PAT) { | ||
725 | for (i = 0; i < 8; i++) | ||
726 | if (!valid_pat_type((data >> (i * 8)) & 0xff)) | ||
727 | return false; | ||
728 | return true; | ||
729 | } else if (msr == MSR_MTRRdefType) { | ||
730 | if (data & ~0xcff) | ||
731 | return false; | ||
732 | return valid_mtrr_type(data & 0xff); | ||
733 | } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { | ||
734 | for (i = 0; i < 8 ; i++) | ||
735 | if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) | ||
736 | return false; | ||
737 | return true; | ||
738 | } | ||
739 | |||
740 | /* variable MTRRs */ | ||
741 | return valid_mtrr_type(data & 0xff); | ||
742 | } | ||
743 | |||
707 | static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 744 | static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
708 | { | 745 | { |
709 | u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; | 746 | u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; |
710 | 747 | ||
711 | if (!msr_mtrr_valid(msr)) | 748 | if (!mtrr_valid(vcpu, msr, data)) |
712 | return 1; | 749 | return 1; |
713 | 750 | ||
714 | if (msr == MSR_MTRRdefType) { | 751 | if (msr == MSR_MTRRdefType) { |
@@ -1079,14 +1116,13 @@ long kvm_arch_dev_ioctl(struct file *filp, | |||
1079 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) | 1116 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) |
1080 | goto out; | 1117 | goto out; |
1081 | r = -E2BIG; | 1118 | r = -E2BIG; |
1082 | if (n < num_msrs_to_save) | 1119 | if (n < msr_list.nmsrs) |
1083 | goto out; | 1120 | goto out; |
1084 | r = -EFAULT; | 1121 | r = -EFAULT; |
1085 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, | 1122 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, |
1086 | num_msrs_to_save * sizeof(u32))) | 1123 | num_msrs_to_save * sizeof(u32))) |
1087 | goto out; | 1124 | goto out; |
1088 | if (copy_to_user(user_msr_list->indices | 1125 | if (copy_to_user(user_msr_list->indices + num_msrs_to_save, |
1089 | + num_msrs_to_save * sizeof(u32), | ||
1090 | &emulated_msrs, | 1126 | &emulated_msrs, |
1091 | ARRAY_SIZE(emulated_msrs) * sizeof(u32))) | 1127 | ARRAY_SIZE(emulated_msrs) * sizeof(u32))) |
1092 | goto out; | 1128 | goto out; |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f2bf1f73d468..d677fa9ca650 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -22,7 +22,8 @@ | |||
22 | * | 22 | * |
23 | * So how does the kernel know it's a Guest? We'll see that later, but let's | 23 | * So how does the kernel know it's a Guest? We'll see that later, but let's |
24 | * just say that we end up here where we replace the native functions various | 24 | * just say that we end up here where we replace the native functions various |
25 | * "paravirt" structures with our Guest versions, then boot like normal. :*/ | 25 | * "paravirt" structures with our Guest versions, then boot like normal. |
26 | :*/ | ||
26 | 27 | ||
27 | /* | 28 | /* |
28 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. | 29 | * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. |
@@ -74,7 +75,8 @@ | |||
74 | * | 75 | * |
75 | * The Guest in our tale is a simple creature: identical to the Host but | 76 | * The Guest in our tale is a simple creature: identical to the Host but |
76 | * behaving in simplified but equivalent ways. In particular, the Guest is the | 77 | * behaving in simplified but equivalent ways. In particular, the Guest is the |
77 | * same kernel as the Host (or at least, built from the same source code). :*/ | 78 | * same kernel as the Host (or at least, built from the same source code). |
79 | :*/ | ||
78 | 80 | ||
79 | struct lguest_data lguest_data = { | 81 | struct lguest_data lguest_data = { |
80 | .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, | 82 | .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, |
@@ -85,7 +87,8 @@ struct lguest_data lguest_data = { | |||
85 | .syscall_vec = SYSCALL_VECTOR, | 87 | .syscall_vec = SYSCALL_VECTOR, |
86 | }; | 88 | }; |
87 | 89 | ||
88 | /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a | 90 | /*G:037 |
91 | * async_hcall() is pretty simple: I'm quite proud of it really. We have a | ||
89 | * ring buffer of stored hypercalls which the Host will run though next time we | 92 | * ring buffer of stored hypercalls which the Host will run though next time we |
90 | * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall | 93 | * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall |
91 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, | 94 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, |
@@ -94,7 +97,8 @@ struct lguest_data lguest_data = { | |||
94 | * If we come around to a slot which hasn't been finished, then the table is | 97 | * If we come around to a slot which hasn't been finished, then the table is |
95 | * full and we just make the hypercall directly. This has the nice side | 98 | * full and we just make the hypercall directly. This has the nice side |
96 | * effect of causing the Host to run all the stored calls in the ring buffer | 99 | * effect of causing the Host to run all the stored calls in the ring buffer |
97 | * which empties it for next time! */ | 100 | * which empties it for next time! |
101 | */ | ||
98 | static void async_hcall(unsigned long call, unsigned long arg1, | 102 | static void async_hcall(unsigned long call, unsigned long arg1, |
99 | unsigned long arg2, unsigned long arg3, | 103 | unsigned long arg2, unsigned long arg3, |
100 | unsigned long arg4) | 104 | unsigned long arg4) |
@@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
103 | static unsigned int next_call; | 107 | static unsigned int next_call; |
104 | unsigned long flags; | 108 | unsigned long flags; |
105 | 109 | ||
106 | /* Disable interrupts if not already disabled: we don't want an | 110 | /* |
111 | * Disable interrupts if not already disabled: we don't want an | ||
107 | * interrupt handler making a hypercall while we're already doing | 112 | * interrupt handler making a hypercall while we're already doing |
108 | * one! */ | 113 | * one! |
114 | */ | ||
109 | local_irq_save(flags); | 115 | local_irq_save(flags); |
110 | if (lguest_data.hcall_status[next_call] != 0xFF) { | 116 | if (lguest_data.hcall_status[next_call] != 0xFF) { |
111 | /* Table full, so do normal hcall which will flush table. */ | 117 | /* Table full, so do normal hcall which will flush table. */ |
@@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
125 | local_irq_restore(flags); | 131 | local_irq_restore(flags); |
126 | } | 132 | } |
127 | 133 | ||
128 | /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first | 134 | /*G:035 |
129 | * real optimization trick! | 135 | * Notice the lazy_hcall() above, rather than hcall(). This is our first real |
136 | * optimization trick! | ||
130 | * | 137 | * |
131 | * When lazy_mode is set, it means we're allowed to defer all hypercalls and do | 138 | * When lazy_mode is set, it means we're allowed to defer all hypercalls and do |
132 | * them as a batch when lazy_mode is eventually turned off. Because hypercalls | 139 | * them as a batch when lazy_mode is eventually turned off. Because hypercalls |
@@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
136 | * lguest_leave_lazy_mode(). | 143 | * lguest_leave_lazy_mode(). |
137 | * | 144 | * |
138 | * So, when we're in lazy mode, we call async_hcall() to store the call for | 145 | * So, when we're in lazy mode, we call async_hcall() to store the call for |
139 | * future processing: */ | 146 | * future processing: |
147 | */ | ||
140 | static void lazy_hcall1(unsigned long call, | 148 | static void lazy_hcall1(unsigned long call, |
141 | unsigned long arg1) | 149 | unsigned long arg1) |
142 | { | 150 | { |
@@ -146,6 +154,7 @@ static void lazy_hcall1(unsigned long call, | |||
146 | async_hcall(call, arg1, 0, 0, 0); | 154 | async_hcall(call, arg1, 0, 0, 0); |
147 | } | 155 | } |
148 | 156 | ||
157 | /* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ | ||
149 | static void lazy_hcall2(unsigned long call, | 158 | static void lazy_hcall2(unsigned long call, |
150 | unsigned long arg1, | 159 | unsigned long arg1, |
151 | unsigned long arg2) | 160 | unsigned long arg2) |
@@ -181,8 +190,10 @@ static void lazy_hcall4(unsigned long call, | |||
181 | } | 190 | } |
182 | #endif | 191 | #endif |
183 | 192 | ||
184 | /* When lazy mode is turned off reset the per-cpu lazy mode variable and then | 193 | /*G:036 |
185 | * issue the do-nothing hypercall to flush any stored calls. */ | 194 | * When lazy mode is turned off reset the per-cpu lazy mode variable and then |
195 | * issue the do-nothing hypercall to flush any stored calls. | ||
196 | :*/ | ||
186 | static void lguest_leave_lazy_mmu_mode(void) | 197 | static void lguest_leave_lazy_mmu_mode(void) |
187 | { | 198 | { |
188 | kvm_hypercall0(LHCALL_FLUSH_ASYNC); | 199 | kvm_hypercall0(LHCALL_FLUSH_ASYNC); |
@@ -208,9 +219,11 @@ static void lguest_end_context_switch(struct task_struct *next) | |||
208 | * check there before it tries to deliver an interrupt. | 219 | * check there before it tries to deliver an interrupt. |
209 | */ | 220 | */ |
210 | 221 | ||
211 | /* save_flags() is expected to return the processor state (ie. "flags"). The | 222 | /* |
223 | * save_flags() is expected to return the processor state (ie. "flags"). The | ||
212 | * flags word contains all kind of stuff, but in practice Linux only cares | 224 | * flags word contains all kind of stuff, but in practice Linux only cares |
213 | * about the interrupt flag. Our "save_flags()" just returns that. */ | 225 | * about the interrupt flag. Our "save_flags()" just returns that. |
226 | */ | ||
214 | static unsigned long save_fl(void) | 227 | static unsigned long save_fl(void) |
215 | { | 228 | { |
216 | return lguest_data.irq_enabled; | 229 | return lguest_data.irq_enabled; |
@@ -222,13 +235,15 @@ static void irq_disable(void) | |||
222 | lguest_data.irq_enabled = 0; | 235 | lguest_data.irq_enabled = 0; |
223 | } | 236 | } |
224 | 237 | ||
225 | /* Let's pause a moment. Remember how I said these are called so often? | 238 | /* |
239 | * Let's pause a moment. Remember how I said these are called so often? | ||
226 | * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to | 240 | * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to |
227 | * break some rules. In particular, these functions are assumed to save their | 241 | * break some rules. In particular, these functions are assumed to save their |
228 | * own registers if they need to: normal C functions assume they can trash the | 242 | * own registers if they need to: normal C functions assume they can trash the |
229 | * eax register. To use normal C functions, we use | 243 | * eax register. To use normal C functions, we use |
230 | * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the | 244 | * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the |
231 | * C function, then restores it. */ | 245 | * C function, then restores it. |
246 | */ | ||
232 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); | 247 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); |
233 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); | 248 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); |
234 | /*:*/ | 249 | /*:*/ |
@@ -237,18 +252,18 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); | |||
237 | extern void lg_irq_enable(void); | 252 | extern void lg_irq_enable(void); |
238 | extern void lg_restore_fl(unsigned long flags); | 253 | extern void lg_restore_fl(unsigned long flags); |
239 | 254 | ||
240 | /*M:003 Note that we don't check for outstanding interrupts when we re-enable | 255 | /*M:003 |
241 | * them (or when we unmask an interrupt). This seems to work for the moment, | 256 | * We could be more efficient in our checking of outstanding interrupts, rather |
242 | * since interrupts are rare and we'll just get the interrupt on the next timer | 257 | * than using a branch. One way would be to put the "irq_enabled" field in a |
243 | * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way | 258 | * page by itself, and have the Host write-protect it when an interrupt comes |
244 | * would be to put the "irq_enabled" field in a page by itself, and have the | 259 | * in when irqs are disabled. There will then be a page fault as soon as |
245 | * Host write-protect it when an interrupt comes in when irqs are disabled. | 260 | * interrupts are re-enabled. |
246 | * There will then be a page fault as soon as interrupts are re-enabled. | ||
247 | * | 261 | * |
248 | * A better method is to implement soft interrupt disable generally for x86: | 262 | * A better method is to implement soft interrupt disable generally for x86: |
249 | * instead of disabling interrupts, we set a flag. If an interrupt does come | 263 | * instead of disabling interrupts, we set a flag. If an interrupt does come |
250 | * in, we then disable them for real. This is uncommon, so we could simply use | 264 | * in, we then disable them for real. This is uncommon, so we could simply use |
251 | * a hypercall for interrupt control and not worry about efficiency. :*/ | 265 | * a hypercall for interrupt control and not worry about efficiency. |
266 | :*/ | ||
252 | 267 | ||
253 | /*G:034 | 268 | /*G:034 |
254 | * The Interrupt Descriptor Table (IDT). | 269 | * The Interrupt Descriptor Table (IDT). |
@@ -261,10 +276,12 @@ extern void lg_restore_fl(unsigned long flags); | |||
261 | static void lguest_write_idt_entry(gate_desc *dt, | 276 | static void lguest_write_idt_entry(gate_desc *dt, |
262 | int entrynum, const gate_desc *g) | 277 | int entrynum, const gate_desc *g) |
263 | { | 278 | { |
264 | /* The gate_desc structure is 8 bytes long: we hand it to the Host in | 279 | /* |
280 | * The gate_desc structure is 8 bytes long: we hand it to the Host in | ||
265 | * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors | 281 | * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors |
266 | * around like this; typesafety wasn't a big concern in Linux's early | 282 | * around like this; typesafety wasn't a big concern in Linux's early |
267 | * years. */ | 283 | * years. |
284 | */ | ||
268 | u32 *desc = (u32 *)g; | 285 | u32 *desc = (u32 *)g; |
269 | /* Keep the local copy up to date. */ | 286 | /* Keep the local copy up to date. */ |
270 | native_write_idt_entry(dt, entrynum, g); | 287 | native_write_idt_entry(dt, entrynum, g); |
@@ -272,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt, | |||
272 | kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); | 289 | kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); |
273 | } | 290 | } |
274 | 291 | ||
275 | /* Changing to a different IDT is very rare: we keep the IDT up-to-date every | 292 | /* |
293 | * Changing to a different IDT is very rare: we keep the IDT up-to-date every | ||
276 | * time it is written, so we can simply loop through all entries and tell the | 294 | * time it is written, so we can simply loop through all entries and tell the |
277 | * Host about them. */ | 295 | * Host about them. |
296 | */ | ||
278 | static void lguest_load_idt(const struct desc_ptr *desc) | 297 | static void lguest_load_idt(const struct desc_ptr *desc) |
279 | { | 298 | { |
280 | unsigned int i; | 299 | unsigned int i; |
@@ -305,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc) | |||
305 | kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); | 324 | kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); |
306 | } | 325 | } |
307 | 326 | ||
308 | /* For a single GDT entry which changes, we do the lazy thing: alter our GDT, | 327 | /* |
328 | * For a single GDT entry which changes, we do the lazy thing: alter our GDT, | ||
309 | * then tell the Host to reload the entire thing. This operation is so rare | 329 | * then tell the Host to reload the entire thing. This operation is so rare |
310 | * that this naive implementation is reasonable. */ | 330 | * that this naive implementation is reasonable. |
331 | */ | ||
311 | static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, | 332 | static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, |
312 | const void *desc, int type) | 333 | const void *desc, int type) |
313 | { | 334 | { |
@@ -317,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, | |||
317 | dt[entrynum].a, dt[entrynum].b); | 338 | dt[entrynum].a, dt[entrynum].b); |
318 | } | 339 | } |
319 | 340 | ||
320 | /* OK, I lied. There are three "thread local storage" GDT entries which change | 341 | /* |
342 | * OK, I lied. There are three "thread local storage" GDT entries which change | ||
321 | * on every context switch (these three entries are how glibc implements | 343 | * on every context switch (these three entries are how glibc implements |
322 | * __thread variables). So we have a hypercall specifically for this case. */ | 344 | * __thread variables). So we have a hypercall specifically for this case. |
345 | */ | ||
323 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) | 346 | static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) |
324 | { | 347 | { |
325 | /* There's one problem which normal hardware doesn't have: the Host | 348 | /* |
349 | * There's one problem which normal hardware doesn't have: the Host | ||
326 | * can't handle us removing entries we're currently using. So we clear | 350 | * can't handle us removing entries we're currently using. So we clear |
327 | * the GS register here: if it's needed it'll be reloaded anyway. */ | 351 | * the GS register here: if it's needed it'll be reloaded anyway. |
352 | */ | ||
328 | lazy_load_gs(0); | 353 | lazy_load_gs(0); |
329 | lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); | 354 | lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); |
330 | } | 355 | } |
331 | 356 | ||
332 | /*G:038 That's enough excitement for now, back to ploughing through each of | 357 | /*G:038 |
333 | * the different pv_ops structures (we're about 1/3 of the way through). | 358 | * That's enough excitement for now, back to ploughing through each of the |
359 | * different pv_ops structures (we're about 1/3 of the way through). | ||
334 | * | 360 | * |
335 | * This is the Local Descriptor Table, another weird Intel thingy. Linux only | 361 | * This is the Local Descriptor Table, another weird Intel thingy. Linux only |
336 | * uses this for some strange applications like Wine. We don't do anything | 362 | * uses this for some strange applications like Wine. We don't do anything |
337 | * here, so they'll get an informative and friendly Segmentation Fault. */ | 363 | * here, so they'll get an informative and friendly Segmentation Fault. |
364 | */ | ||
338 | static void lguest_set_ldt(const void *addr, unsigned entries) | 365 | static void lguest_set_ldt(const void *addr, unsigned entries) |
339 | { | 366 | { |
340 | } | 367 | } |
341 | 368 | ||
342 | /* This loads a GDT entry into the "Task Register": that entry points to a | 369 | /* |
370 | * This loads a GDT entry into the "Task Register": that entry points to a | ||
343 | * structure called the Task State Segment. Some comments scattered though the | 371 | * structure called the Task State Segment. Some comments scattered though the |
344 | * kernel code indicate that this used for task switching in ages past, along | 372 | * kernel code indicate that this used for task switching in ages past, along |
345 | * with blood sacrifice and astrology. | 373 | * with blood sacrifice and astrology. |
@@ -347,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries) | |||
347 | * Now there's nothing interesting in here that we don't get told elsewhere. | 375 | * Now there's nothing interesting in here that we don't get told elsewhere. |
348 | * But the native version uses the "ltr" instruction, which makes the Host | 376 | * But the native version uses the "ltr" instruction, which makes the Host |
349 | * complain to the Guest about a Segmentation Fault and it'll oops. So we | 377 | * complain to the Guest about a Segmentation Fault and it'll oops. So we |
350 | * override the native version with a do-nothing version. */ | 378 | * override the native version with a do-nothing version. |
379 | */ | ||
351 | static void lguest_load_tr_desc(void) | 380 | static void lguest_load_tr_desc(void) |
352 | { | 381 | { |
353 | } | 382 | } |
354 | 383 | ||
355 | /* The "cpuid" instruction is a way of querying both the CPU identity | 384 | /* |
385 | * The "cpuid" instruction is a way of querying both the CPU identity | ||
356 | * (manufacturer, model, etc) and its features. It was introduced before the | 386 | * (manufacturer, model, etc) and its features. It was introduced before the |
357 | * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. | 387 | * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. |
358 | * As you might imagine, after a decade and a half this treatment, it is now a | 388 | * As you might imagine, after a decade and a half this treatment, it is now a |
359 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. | 389 | * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. |
360 | * | 390 | * |
361 | * This instruction even it has its own Wikipedia entry. The Wikipedia entry | 391 | * This instruction even it has its own Wikipedia entry. The Wikipedia entry |
362 | * has been translated into 4 languages. I am not making this up! | 392 | * has been translated into 5 languages. I am not making this up! |
363 | * | 393 | * |
364 | * We could get funky here and identify ourselves as "GenuineLguest", but | 394 | * We could get funky here and identify ourselves as "GenuineLguest", but |
365 | * instead we just use the real "cpuid" instruction. Then I pretty much turned | 395 | * instead we just use the real "cpuid" instruction. Then I pretty much turned |
@@ -371,7 +401,8 @@ static void lguest_load_tr_desc(void) | |||
371 | * Replacing the cpuid so we can turn features off is great for the kernel, but | 401 | * Replacing the cpuid so we can turn features off is great for the kernel, but |
372 | * anyone (including userspace) can just use the raw "cpuid" instruction and | 402 | * anyone (including userspace) can just use the raw "cpuid" instruction and |
373 | * the Host won't even notice since it isn't privileged. So we try not to get | 403 | * the Host won't even notice since it isn't privileged. So we try not to get |
374 | * too worked up about it. */ | 404 | * too worked up about it. |
405 | */ | ||
375 | static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | 406 | static void lguest_cpuid(unsigned int *ax, unsigned int *bx, |
376 | unsigned int *cx, unsigned int *dx) | 407 | unsigned int *cx, unsigned int *dx) |
377 | { | 408 | { |
@@ -379,43 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
379 | 410 | ||
380 | native_cpuid(ax, bx, cx, dx); | 411 | native_cpuid(ax, bx, cx, dx); |
381 | switch (function) { | 412 | switch (function) { |
382 | case 0: /* ID and highest CPUID. Futureproof a little by sticking to | 413 | /* |
383 | * older ones. */ | 414 | * CPUID 0 gives the highest legal CPUID number (and the ID string). |
415 | * We futureproof our code a little by sticking to known CPUID values. | ||
416 | */ | ||
417 | case 0: | ||
384 | if (*ax > 5) | 418 | if (*ax > 5) |
385 | *ax = 5; | 419 | *ax = 5; |
386 | break; | 420 | break; |
387 | case 1: /* Basic feature request. */ | 421 | |
388 | /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ | 422 | /* |
423 | * CPUID 1 is a basic feature request. | ||
424 | * | ||
425 | * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 | ||
426 | * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. | ||
427 | */ | ||
428 | case 1: | ||
389 | *cx &= 0x00002201; | 429 | *cx &= 0x00002201; |
390 | /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ | ||
391 | *dx &= 0x07808151; | 430 | *dx &= 0x07808151; |
392 | /* The Host can do a nice optimization if it knows that the | 431 | /* |
432 | * The Host can do a nice optimization if it knows that the | ||
393 | * kernel mappings (addresses above 0xC0000000 or whatever | 433 | * kernel mappings (addresses above 0xC0000000 or whatever |
394 | * PAGE_OFFSET is set to) haven't changed. But Linux calls | 434 | * PAGE_OFFSET is set to) haven't changed. But Linux calls |
395 | * flush_tlb_user() for both user and kernel mappings unless | 435 | * flush_tlb_user() for both user and kernel mappings unless |
396 | * the Page Global Enable (PGE) feature bit is set. */ | 436 | * the Page Global Enable (PGE) feature bit is set. |
437 | */ | ||
397 | *dx |= 0x00002000; | 438 | *dx |= 0x00002000; |
398 | /* We also lie, and say we're family id 5. 6 or greater | 439 | /* |
440 | * We also lie, and say we're family id 5. 6 or greater | ||
399 | * leads to a rdmsr in early_init_intel which we can't handle. | 441 | * leads to a rdmsr in early_init_intel which we can't handle. |
400 | * Family ID is returned as bits 8-12 in ax. */ | 442 | * Family ID is returned as bits 8-12 in ax. |
443 | */ | ||
401 | *ax &= 0xFFFFF0FF; | 444 | *ax &= 0xFFFFF0FF; |
402 | *ax |= 0x00000500; | 445 | *ax |= 0x00000500; |
403 | break; | 446 | break; |
447 | /* | ||
448 | * 0x80000000 returns the highest Extended Function, so we futureproof | ||
449 | * like we do above by limiting it to known fields. | ||
450 | */ | ||
404 | case 0x80000000: | 451 | case 0x80000000: |
405 | /* Futureproof this a little: if they ask how much extended | ||
406 | * processor information there is, limit it to known fields. */ | ||
407 | if (*ax > 0x80000008) | 452 | if (*ax > 0x80000008) |
408 | *ax = 0x80000008; | 453 | *ax = 0x80000008; |
409 | break; | 454 | break; |
455 | |||
456 | /* | ||
457 | * PAE systems can mark pages as non-executable. Linux calls this the | ||
458 | * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced | ||
459 | * Virus Protection). We just switch turn if off here, since we don't | ||
460 | * support it. | ||
461 | */ | ||
410 | case 0x80000001: | 462 | case 0x80000001: |
411 | /* Here we should fix nx cap depending on host. */ | ||
412 | /* For this version of PAE, we just clear NX bit. */ | ||
413 | *dx &= ~(1 << 20); | 463 | *dx &= ~(1 << 20); |
414 | break; | 464 | break; |
415 | } | 465 | } |
416 | } | 466 | } |
417 | 467 | ||
418 | /* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. | 468 | /* |
469 | * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. | ||
419 | * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother | 470 | * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother |
420 | * it. The Host needs to know when the Guest wants to change them, so we have | 471 | * it. The Host needs to know when the Guest wants to change them, so we have |
421 | * a whole series of functions like read_cr0() and write_cr0(). | 472 | * a whole series of functions like read_cr0() and write_cr0(). |
@@ -430,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
430 | * name like "FPUTRAP bit" be a little less cryptic? | 481 | * name like "FPUTRAP bit" be a little less cryptic? |
431 | * | 482 | * |
432 | * We store cr0 locally because the Host never changes it. The Guest sometimes | 483 | * We store cr0 locally because the Host never changes it. The Guest sometimes |
433 | * wants to read it and we'd prefer not to bother the Host unnecessarily. */ | 484 | * wants to read it and we'd prefer not to bother the Host unnecessarily. |
485 | */ | ||
434 | static unsigned long current_cr0; | 486 | static unsigned long current_cr0; |
435 | static void lguest_write_cr0(unsigned long val) | 487 | static void lguest_write_cr0(unsigned long val) |
436 | { | 488 | { |
@@ -443,18 +495,22 @@ static unsigned long lguest_read_cr0(void) | |||
443 | return current_cr0; | 495 | return current_cr0; |
444 | } | 496 | } |
445 | 497 | ||
446 | /* Intel provided a special instruction to clear the TS bit for people too cool | 498 | /* |
499 | * Intel provided a special instruction to clear the TS bit for people too cool | ||
447 | * to use write_cr0() to do it. This "clts" instruction is faster, because all | 500 | * to use write_cr0() to do it. This "clts" instruction is faster, because all |
448 | * the vowels have been optimized out. */ | 501 | * the vowels have been optimized out. |
502 | */ | ||
449 | static void lguest_clts(void) | 503 | static void lguest_clts(void) |
450 | { | 504 | { |
451 | lazy_hcall1(LHCALL_TS, 0); | 505 | lazy_hcall1(LHCALL_TS, 0); |
452 | current_cr0 &= ~X86_CR0_TS; | 506 | current_cr0 &= ~X86_CR0_TS; |
453 | } | 507 | } |
454 | 508 | ||
455 | /* cr2 is the virtual address of the last page fault, which the Guest only ever | 509 | /* |
510 | * cr2 is the virtual address of the last page fault, which the Guest only ever | ||
456 | * reads. The Host kindly writes this into our "struct lguest_data", so we | 511 | * reads. The Host kindly writes this into our "struct lguest_data", so we |
457 | * just read it out of there. */ | 512 | * just read it out of there. |
513 | */ | ||
458 | static unsigned long lguest_read_cr2(void) | 514 | static unsigned long lguest_read_cr2(void) |
459 | { | 515 | { |
460 | return lguest_data.cr2; | 516 | return lguest_data.cr2; |
@@ -463,10 +519,12 @@ static unsigned long lguest_read_cr2(void) | |||
463 | /* See lguest_set_pte() below. */ | 519 | /* See lguest_set_pte() below. */ |
464 | static bool cr3_changed = false; | 520 | static bool cr3_changed = false; |
465 | 521 | ||
466 | /* cr3 is the current toplevel pagetable page: the principle is the same as | 522 | /* |
523 | * cr3 is the current toplevel pagetable page: the principle is the same as | ||
467 | * cr0. Keep a local copy, and tell the Host when it changes. The only | 524 | * cr0. Keep a local copy, and tell the Host when it changes. The only |
468 | * difference is that our local copy is in lguest_data because the Host needs | 525 | * difference is that our local copy is in lguest_data because the Host needs |
469 | * to set it upon our initial hypercall. */ | 526 | * to set it upon our initial hypercall. |
527 | */ | ||
470 | static void lguest_write_cr3(unsigned long cr3) | 528 | static void lguest_write_cr3(unsigned long cr3) |
471 | { | 529 | { |
472 | lguest_data.pgdir = cr3; | 530 | lguest_data.pgdir = cr3; |
@@ -511,7 +569,7 @@ static void lguest_write_cr4(unsigned long val) | |||
511 | * cr3 ---> +---------+ | 569 | * cr3 ---> +---------+ |
512 | * | --------->+---------+ | 570 | * | --------->+---------+ |
513 | * | | | PADDR1 | | 571 | * | | | PADDR1 | |
514 | * Top-level | | PADDR2 | | 572 | * Mid-level | | PADDR2 | |
515 | * (PMD) page | | | | 573 | * (PMD) page | | | |
516 | * | | Lower-level | | 574 | * | | Lower-level | |
517 | * | | (PTE) page | | 575 | * | | (PTE) page | |
@@ -531,21 +589,62 @@ static void lguest_write_cr4(unsigned long val) | |||
531 | * Index into top Index into second Offset within page | 589 | * Index into top Index into second Offset within page |
532 | * page directory page pagetable page | 590 | * page directory page pagetable page |
533 | * | 591 | * |
534 | * The kernel spends a lot of time changing both the top-level page directory | 592 | * Now, unfortunately, this isn't the whole story: Intel added Physical Address |
535 | * and lower-level pagetable pages. The Guest doesn't know physical addresses, | 593 | * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). |
536 | * so while it maintains these page tables exactly like normal, it also needs | 594 | * These are held in 64-bit page table entries, so we can now only fit 512 |
537 | * to keep the Host informed whenever it makes a change: the Host will create | 595 | * entries in a page, and the neat three-level tree breaks down. |
538 | * the real page tables based on the Guests'. | 596 | * |
597 | * The result is a four level page table: | ||
598 | * | ||
599 | * cr3 --> [ 4 Upper ] | ||
600 | * [ Level ] | ||
601 | * [ Entries ] | ||
602 | * [(PUD Page)]---> +---------+ | ||
603 | * | --------->+---------+ | ||
604 | * | | | PADDR1 | | ||
605 | * Mid-level | | PADDR2 | | ||
606 | * (PMD) page | | | | ||
607 | * | | Lower-level | | ||
608 | * | | (PTE) page | | ||
609 | * | | | | | ||
610 | * .... .... | ||
611 | * | ||
612 | * | ||
613 | * And the virtual address is decoded as: | ||
614 | * | ||
615 | * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
616 | * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| | ||
617 | * Index into Index into mid Index into lower Offset within page | ||
618 | * top entries directory page pagetable page | ||
619 | * | ||
620 | * It's too hard to switch between these two formats at runtime, so Linux only | ||
621 | * supports one or the other depending on whether CONFIG_X86_PAE is set. Many | ||
622 | * distributions turn it on, and not just for people with silly amounts of | ||
623 | * memory: the larger PTE entries allow room for the NX bit, which lets the | ||
624 | * kernel disable execution of pages and increase security. | ||
625 | * | ||
626 | * This was a problem for lguest, which couldn't run on these distributions; | ||
627 | * then Matias Zabaljauregui figured it all out and implemented it, and only a | ||
628 | * handful of puppies were crushed in the process! | ||
629 | * | ||
630 | * Back to our point: the kernel spends a lot of time changing both the | ||
631 | * top-level page directory and lower-level pagetable pages. The Guest doesn't | ||
632 | * know physical addresses, so while it maintains these page tables exactly | ||
633 | * like normal, it also needs to keep the Host informed whenever it makes a | ||
634 | * change: the Host will create the real page tables based on the Guests'. | ||
539 | */ | 635 | */ |
540 | 636 | ||
541 | /* The Guest calls this to set a second-level entry (pte), ie. to map a page | 637 | /* |
542 | * into a process' address space. We set the entry then tell the Host the | 638 | * The Guest calls this after it has set a second-level entry (pte), ie. to map |
543 | * toplevel and address this corresponds to. The Guest uses one pagetable per | 639 | * a page into a process' address space. Wetell the Host the toplevel and |
544 | * process, so we need to tell the Host which one we're changing (mm->pgd). */ | 640 | * address this corresponds to. The Guest uses one pagetable per process, so |
641 | * we need to tell the Host which one we're changing (mm->pgd). | ||
642 | */ | ||
545 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | 643 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, |
546 | pte_t *ptep) | 644 | pte_t *ptep) |
547 | { | 645 | { |
548 | #ifdef CONFIG_X86_PAE | 646 | #ifdef CONFIG_X86_PAE |
647 | /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ | ||
549 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, | 648 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, |
550 | ptep->pte_low, ptep->pte_high); | 649 | ptep->pte_low, ptep->pte_high); |
551 | #else | 650 | #else |
@@ -553,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | |||
553 | #endif | 652 | #endif |
554 | } | 653 | } |
555 | 654 | ||
655 | /* This is the "set and update" combo-meal-deal version. */ | ||
556 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | 656 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, |
557 | pte_t *ptep, pte_t pteval) | 657 | pte_t *ptep, pte_t pteval) |
558 | { | 658 | { |
@@ -560,10 +660,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
560 | lguest_pte_update(mm, addr, ptep); | 660 | lguest_pte_update(mm, addr, ptep); |
561 | } | 661 | } |
562 | 662 | ||
563 | /* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd | 663 | /* |
664 | * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd | ||
564 | * to set a middle-level entry when PAE is activated. | 665 | * to set a middle-level entry when PAE is activated. |
666 | * | ||
565 | * Again, we set the entry then tell the Host which page we changed, | 667 | * Again, we set the entry then tell the Host which page we changed, |
566 | * and the index of the entry we changed. */ | 668 | * and the index of the entry we changed. |
669 | */ | ||
567 | #ifdef CONFIG_X86_PAE | 670 | #ifdef CONFIG_X86_PAE |
568 | static void lguest_set_pud(pud_t *pudp, pud_t pudval) | 671 | static void lguest_set_pud(pud_t *pudp, pud_t pudval) |
569 | { | 672 | { |
@@ -582,8 +685,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
582 | } | 685 | } |
583 | #else | 686 | #else |
584 | 687 | ||
585 | /* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not | 688 | /* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ |
586 | * activated. */ | ||
587 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | 689 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) |
588 | { | 690 | { |
589 | native_set_pmd(pmdp, pmdval); | 691 | native_set_pmd(pmdp, pmdval); |
@@ -592,7 +694,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
592 | } | 694 | } |
593 | #endif | 695 | #endif |
594 | 696 | ||
595 | /* There are a couple of legacy places where the kernel sets a PTE, but we | 697 | /* |
698 | * There are a couple of legacy places where the kernel sets a PTE, but we | ||
596 | * don't know the top level any more. This is useless for us, since we don't | 699 | * don't know the top level any more. This is useless for us, since we don't |
597 | * know which pagetable is changing or what address, so we just tell the Host | 700 | * know which pagetable is changing or what address, so we just tell the Host |
598 | * to forget all of them. Fortunately, this is very rare. | 701 | * to forget all of them. Fortunately, this is very rare. |
@@ -600,7 +703,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
600 | * ... except in early boot when the kernel sets up the initial pagetables, | 703 | * ... except in early boot when the kernel sets up the initial pagetables, |
601 | * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell | 704 | * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell |
602 | * the Host anything changed until we've done the first page table switch, | 705 | * the Host anything changed until we've done the first page table switch, |
603 | * which brings boot back to 0.25 seconds. */ | 706 | * which brings boot back to 0.25 seconds. |
707 | */ | ||
604 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) | 708 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) |
605 | { | 709 | { |
606 | native_set_pte(ptep, pteval); | 710 | native_set_pte(ptep, pteval); |
@@ -609,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) | |||
609 | } | 713 | } |
610 | 714 | ||
611 | #ifdef CONFIG_X86_PAE | 715 | #ifdef CONFIG_X86_PAE |
716 | /* | ||
717 | * With 64-bit PTE values, we need to be careful setting them: if we set 32 | ||
718 | * bits at a time, the hardware could see a weird half-set entry. These | ||
719 | * versions ensure we update all 64 bits at once. | ||
720 | */ | ||
612 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | 721 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) |
613 | { | 722 | { |
614 | native_set_pte_atomic(ptep, pte); | 723 | native_set_pte_atomic(ptep, pte); |
@@ -616,19 +725,21 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | |||
616 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | 725 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); |
617 | } | 726 | } |
618 | 727 | ||
619 | void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | 728 | static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, |
729 | pte_t *ptep) | ||
620 | { | 730 | { |
621 | native_pte_clear(mm, addr, ptep); | 731 | native_pte_clear(mm, addr, ptep); |
622 | lguest_pte_update(mm, addr, ptep); | 732 | lguest_pte_update(mm, addr, ptep); |
623 | } | 733 | } |
624 | 734 | ||
625 | void lguest_pmd_clear(pmd_t *pmdp) | 735 | static void lguest_pmd_clear(pmd_t *pmdp) |
626 | { | 736 | { |
627 | lguest_set_pmd(pmdp, __pmd(0)); | 737 | lguest_set_pmd(pmdp, __pmd(0)); |
628 | } | 738 | } |
629 | #endif | 739 | #endif |
630 | 740 | ||
631 | /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on | 741 | /* |
742 | * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on | ||
632 | * native page table operations. On native hardware you can set a new page | 743 | * native page table operations. On native hardware you can set a new page |
633 | * table entry whenever you want, but if you want to remove one you have to do | 744 | * table entry whenever you want, but if you want to remove one you have to do |
634 | * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). | 745 | * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). |
@@ -637,24 +748,29 @@ void lguest_pmd_clear(pmd_t *pmdp) | |||
637 | * called when a valid entry is written, not when it's removed (ie. marked not | 748 | * called when a valid entry is written, not when it's removed (ie. marked not |
638 | * present). Instead, this is where we come when the Guest wants to remove a | 749 | * present). Instead, this is where we come when the Guest wants to remove a |
639 | * page table entry: we tell the Host to set that entry to 0 (ie. the present | 750 | * page table entry: we tell the Host to set that entry to 0 (ie. the present |
640 | * bit is zero). */ | 751 | * bit is zero). |
752 | */ | ||
641 | static void lguest_flush_tlb_single(unsigned long addr) | 753 | static void lguest_flush_tlb_single(unsigned long addr) |
642 | { | 754 | { |
643 | /* Simply set it to zero: if it was not, it will fault back in. */ | 755 | /* Simply set it to zero: if it was not, it will fault back in. */ |
644 | lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); | 756 | lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); |
645 | } | 757 | } |
646 | 758 | ||
647 | /* This is what happens after the Guest has removed a large number of entries. | 759 | /* |
760 | * This is what happens after the Guest has removed a large number of entries. | ||
648 | * This tells the Host that any of the page table entries for userspace might | 761 | * This tells the Host that any of the page table entries for userspace might |
649 | * have changed, ie. virtual addresses below PAGE_OFFSET. */ | 762 | * have changed, ie. virtual addresses below PAGE_OFFSET. |
763 | */ | ||
650 | static void lguest_flush_tlb_user(void) | 764 | static void lguest_flush_tlb_user(void) |
651 | { | 765 | { |
652 | lazy_hcall1(LHCALL_FLUSH_TLB, 0); | 766 | lazy_hcall1(LHCALL_FLUSH_TLB, 0); |
653 | } | 767 | } |
654 | 768 | ||
655 | /* This is called when the kernel page tables have changed. That's not very | 769 | /* |
770 | * This is called when the kernel page tables have changed. That's not very | ||
656 | * common (unless the Guest is using highmem, which makes the Guest extremely | 771 | * common (unless the Guest is using highmem, which makes the Guest extremely |
657 | * slow), so it's worth separating this from the user flushing above. */ | 772 | * slow), so it's worth separating this from the user flushing above. |
773 | */ | ||
658 | static void lguest_flush_tlb_kernel(void) | 774 | static void lguest_flush_tlb_kernel(void) |
659 | { | 775 | { |
660 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | 776 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); |
@@ -691,26 +807,38 @@ static struct irq_chip lguest_irq_controller = { | |||
691 | .unmask = enable_lguest_irq, | 807 | .unmask = enable_lguest_irq, |
692 | }; | 808 | }; |
693 | 809 | ||
694 | /* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware | 810 | /* |
811 | * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware | ||
695 | * interrupt (except 128, which is used for system calls), and then tells the | 812 | * interrupt (except 128, which is used for system calls), and then tells the |
696 | * Linux infrastructure that each interrupt is controlled by our level-based | 813 | * Linux infrastructure that each interrupt is controlled by our level-based |
697 | * lguest interrupt controller. */ | 814 | * lguest interrupt controller. |
815 | */ | ||
698 | static void __init lguest_init_IRQ(void) | 816 | static void __init lguest_init_IRQ(void) |
699 | { | 817 | { |
700 | unsigned int i; | 818 | unsigned int i; |
701 | 819 | ||
702 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { | 820 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { |
703 | /* Some systems map "vectors" to interrupts weirdly. Lguest has | 821 | /* Some systems map "vectors" to interrupts weirdly. Not us! */ |
704 | * a straightforward 1 to 1 mapping, so force that here. */ | ||
705 | __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; | 822 | __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; |
706 | if (i != SYSCALL_VECTOR) | 823 | if (i != SYSCALL_VECTOR) |
707 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); | 824 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); |
708 | } | 825 | } |
709 | /* This call is required to set up for 4k stacks, where we have | 826 | |
710 | * separate stacks for hard and soft interrupts. */ | 827 | /* |
828 | * This call is required to set up for 4k stacks, where we have | ||
829 | * separate stacks for hard and soft interrupts. | ||
830 | */ | ||
711 | irq_ctx_init(smp_processor_id()); | 831 | irq_ctx_init(smp_processor_id()); |
712 | } | 832 | } |
713 | 833 | ||
834 | /* | ||
835 | * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so | ||
836 | * rather than set them in lguest_init_IRQ we are called here every time an | ||
837 | * lguest device needs an interrupt. | ||
838 | * | ||
839 | * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should | ||
840 | * pass that up! | ||
841 | */ | ||
714 | void lguest_setup_irq(unsigned int irq) | 842 | void lguest_setup_irq(unsigned int irq) |
715 | { | 843 | { |
716 | irq_to_desc_alloc_node(irq, 0); | 844 | irq_to_desc_alloc_node(irq, 0); |
@@ -729,31 +857,39 @@ static unsigned long lguest_get_wallclock(void) | |||
729 | return lguest_data.time.tv_sec; | 857 | return lguest_data.time.tv_sec; |
730 | } | 858 | } |
731 | 859 | ||
732 | /* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us | 860 | /* |
861 | * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us | ||
733 | * what speed it runs at, or 0 if it's unusable as a reliable clock source. | 862 | * what speed it runs at, or 0 if it's unusable as a reliable clock source. |
734 | * This matches what we want here: if we return 0 from this function, the x86 | 863 | * This matches what we want here: if we return 0 from this function, the x86 |
735 | * TSC clock will give up and not register itself. */ | 864 | * TSC clock will give up and not register itself. |
865 | */ | ||
736 | static unsigned long lguest_tsc_khz(void) | 866 | static unsigned long lguest_tsc_khz(void) |
737 | { | 867 | { |
738 | return lguest_data.tsc_khz; | 868 | return lguest_data.tsc_khz; |
739 | } | 869 | } |
740 | 870 | ||
741 | /* If we can't use the TSC, the kernel falls back to our lower-priority | 871 | /* |
742 | * "lguest_clock", where we read the time value given to us by the Host. */ | 872 | * If we can't use the TSC, the kernel falls back to our lower-priority |
873 | * "lguest_clock", where we read the time value given to us by the Host. | ||
874 | */ | ||
743 | static cycle_t lguest_clock_read(struct clocksource *cs) | 875 | static cycle_t lguest_clock_read(struct clocksource *cs) |
744 | { | 876 | { |
745 | unsigned long sec, nsec; | 877 | unsigned long sec, nsec; |
746 | 878 | ||
747 | /* Since the time is in two parts (seconds and nanoseconds), we risk | 879 | /* |
880 | * Since the time is in two parts (seconds and nanoseconds), we risk | ||
748 | * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, | 881 | * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, |
749 | * and getting 99 and 0. As Linux tends to come apart under the stress | 882 | * and getting 99 and 0. As Linux tends to come apart under the stress |
750 | * of time travel, we must be careful: */ | 883 | * of time travel, we must be careful: |
884 | */ | ||
751 | do { | 885 | do { |
752 | /* First we read the seconds part. */ | 886 | /* First we read the seconds part. */ |
753 | sec = lguest_data.time.tv_sec; | 887 | sec = lguest_data.time.tv_sec; |
754 | /* This read memory barrier tells the compiler and the CPU that | 888 | /* |
889 | * This read memory barrier tells the compiler and the CPU that | ||
755 | * this can't be reordered: we have to complete the above | 890 | * this can't be reordered: we have to complete the above |
756 | * before going on. */ | 891 | * before going on. |
892 | */ | ||
757 | rmb(); | 893 | rmb(); |
758 | /* Now we read the nanoseconds part. */ | 894 | /* Now we read the nanoseconds part. */ |
759 | nsec = lguest_data.time.tv_nsec; | 895 | nsec = lguest_data.time.tv_nsec; |
@@ -777,9 +913,11 @@ static struct clocksource lguest_clock = { | |||
777 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 913 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
778 | }; | 914 | }; |
779 | 915 | ||
780 | /* We also need a "struct clock_event_device": Linux asks us to set it to go | 916 | /* |
917 | * We also need a "struct clock_event_device": Linux asks us to set it to go | ||
781 | * off some time in the future. Actually, James Morris figured all this out, I | 918 | * off some time in the future. Actually, James Morris figured all this out, I |
782 | * just applied the patch. */ | 919 | * just applied the patch. |
920 | */ | ||
783 | static int lguest_clockevent_set_next_event(unsigned long delta, | 921 | static int lguest_clockevent_set_next_event(unsigned long delta, |
784 | struct clock_event_device *evt) | 922 | struct clock_event_device *evt) |
785 | { | 923 | { |
@@ -829,8 +967,10 @@ static struct clock_event_device lguest_clockevent = { | |||
829 | .max_delta_ns = LG_CLOCK_MAX_DELTA, | 967 | .max_delta_ns = LG_CLOCK_MAX_DELTA, |
830 | }; | 968 | }; |
831 | 969 | ||
832 | /* This is the Guest timer interrupt handler (hardware interrupt 0). We just | 970 | /* |
833 | * call the clockevent infrastructure and it does whatever needs doing. */ | 971 | * This is the Guest timer interrupt handler (hardware interrupt 0). We just |
972 | * call the clockevent infrastructure and it does whatever needs doing. | ||
973 | */ | ||
834 | static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) | 974 | static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) |
835 | { | 975 | { |
836 | unsigned long flags; | 976 | unsigned long flags; |
@@ -841,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) | |||
841 | local_irq_restore(flags); | 981 | local_irq_restore(flags); |
842 | } | 982 | } |
843 | 983 | ||
844 | /* At some point in the boot process, we get asked to set up our timing | 984 | /* |
985 | * At some point in the boot process, we get asked to set up our timing | ||
845 | * infrastructure. The kernel doesn't expect timer interrupts before this, but | 986 | * infrastructure. The kernel doesn't expect timer interrupts before this, but |
846 | * we cleverly initialized the "blocked_interrupts" field of "struct | 987 | * we cleverly initialized the "blocked_interrupts" field of "struct |
847 | * lguest_data" so that timer interrupts were blocked until now. */ | 988 | * lguest_data" so that timer interrupts were blocked until now. |
989 | */ | ||
848 | static void lguest_time_init(void) | 990 | static void lguest_time_init(void) |
849 | { | 991 | { |
850 | /* Set up the timer interrupt (0) to go to our simple timer routine */ | 992 | /* Set up the timer interrupt (0) to go to our simple timer routine */ |
@@ -868,14 +1010,16 @@ static void lguest_time_init(void) | |||
868 | * to work. They're pretty simple. | 1010 | * to work. They're pretty simple. |
869 | */ | 1011 | */ |
870 | 1012 | ||
871 | /* The Guest needs to tell the Host what stack it expects traps to use. For | 1013 | /* |
1014 | * The Guest needs to tell the Host what stack it expects traps to use. For | ||
872 | * native hardware, this is part of the Task State Segment mentioned above in | 1015 | * native hardware, this is part of the Task State Segment mentioned above in |
873 | * lguest_load_tr_desc(), but to help hypervisors there's this special call. | 1016 | * lguest_load_tr_desc(), but to help hypervisors there's this special call. |
874 | * | 1017 | * |
875 | * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data | 1018 | * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data |
876 | * segment), the privilege level (we're privilege level 1, the Host is 0 and | 1019 | * segment), the privilege level (we're privilege level 1, the Host is 0 and |
877 | * will not tolerate us trying to use that), the stack pointer, and the number | 1020 | * will not tolerate us trying to use that), the stack pointer, and the number |
878 | * of pages in the stack. */ | 1021 | * of pages in the stack. |
1022 | */ | ||
879 | static void lguest_load_sp0(struct tss_struct *tss, | 1023 | static void lguest_load_sp0(struct tss_struct *tss, |
880 | struct thread_struct *thread) | 1024 | struct thread_struct *thread) |
881 | { | 1025 | { |
@@ -889,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value) | |||
889 | /* FIXME: Implement */ | 1033 | /* FIXME: Implement */ |
890 | } | 1034 | } |
891 | 1035 | ||
892 | /* There are times when the kernel wants to make sure that no memory writes are | 1036 | /* |
1037 | * There are times when the kernel wants to make sure that no memory writes are | ||
893 | * caught in the cache (that they've all reached real hardware devices). This | 1038 | * caught in the cache (that they've all reached real hardware devices). This |
894 | * doesn't matter for the Guest which has virtual hardware. | 1039 | * doesn't matter for the Guest which has virtual hardware. |
895 | * | 1040 | * |
@@ -903,11 +1048,13 @@ static void lguest_wbinvd(void) | |||
903 | { | 1048 | { |
904 | } | 1049 | } |
905 | 1050 | ||
906 | /* If the Guest expects to have an Advanced Programmable Interrupt Controller, | 1051 | /* |
1052 | * If the Guest expects to have an Advanced Programmable Interrupt Controller, | ||
907 | * we play dumb by ignoring writes and returning 0 for reads. So it's no | 1053 | * we play dumb by ignoring writes and returning 0 for reads. So it's no |
908 | * longer Programmable nor Controlling anything, and I don't think 8 lines of | 1054 | * longer Programmable nor Controlling anything, and I don't think 8 lines of |
909 | * code qualifies for Advanced. It will also never interrupt anything. It | 1055 | * code qualifies for Advanced. It will also never interrupt anything. It |
910 | * does, however, allow us to get through the Linux boot code. */ | 1056 | * does, however, allow us to get through the Linux boot code. |
1057 | */ | ||
911 | #ifdef CONFIG_X86_LOCAL_APIC | 1058 | #ifdef CONFIG_X86_LOCAL_APIC |
912 | static void lguest_apic_write(u32 reg, u32 v) | 1059 | static void lguest_apic_write(u32 reg, u32 v) |
913 | { | 1060 | { |
@@ -956,11 +1103,13 @@ static void lguest_safe_halt(void) | |||
956 | kvm_hypercall0(LHCALL_HALT); | 1103 | kvm_hypercall0(LHCALL_HALT); |
957 | } | 1104 | } |
958 | 1105 | ||
959 | /* The SHUTDOWN hypercall takes a string to describe what's happening, and | 1106 | /* |
1107 | * The SHUTDOWN hypercall takes a string to describe what's happening, and | ||
960 | * an argument which says whether this to restart (reboot) the Guest or not. | 1108 | * an argument which says whether this to restart (reboot) the Guest or not. |
961 | * | 1109 | * |
962 | * Note that the Host always prefers that the Guest speak in physical addresses | 1110 | * Note that the Host always prefers that the Guest speak in physical addresses |
963 | * rather than virtual addresses, so we use __pa() here. */ | 1111 | * rather than virtual addresses, so we use __pa() here. |
1112 | */ | ||
964 | static void lguest_power_off(void) | 1113 | static void lguest_power_off(void) |
965 | { | 1114 | { |
966 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), | 1115 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), |
@@ -991,8 +1140,10 @@ static __init char *lguest_memory_setup(void) | |||
991 | * nice to move it back to lguest_init. Patch welcome... */ | 1140 | * nice to move it back to lguest_init. Patch welcome... */ |
992 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); | 1141 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); |
993 | 1142 | ||
994 | /* The Linux bootloader header contains an "e820" memory map: the | 1143 | /* |
995 | * Launcher populated the first entry with our memory limit. */ | 1144 | *The Linux bootloader header contains an "e820" memory map: the |
1145 | * Launcher populated the first entry with our memory limit. | ||
1146 | */ | ||
996 | e820_add_region(boot_params.e820_map[0].addr, | 1147 | e820_add_region(boot_params.e820_map[0].addr, |
997 | boot_params.e820_map[0].size, | 1148 | boot_params.e820_map[0].size, |
998 | boot_params.e820_map[0].type); | 1149 | boot_params.e820_map[0].type); |
@@ -1001,16 +1152,17 @@ static __init char *lguest_memory_setup(void) | |||
1001 | return "LGUEST"; | 1152 | return "LGUEST"; |
1002 | } | 1153 | } |
1003 | 1154 | ||
1004 | /* We will eventually use the virtio console device to produce console output, | 1155 | /* |
1156 | * We will eventually use the virtio console device to produce console output, | ||
1005 | * but before that is set up we use LHCALL_NOTIFY on normal memory to produce | 1157 | * but before that is set up we use LHCALL_NOTIFY on normal memory to produce |
1006 | * console output. */ | 1158 | * console output. |
1159 | */ | ||
1007 | static __init int early_put_chars(u32 vtermno, const char *buf, int count) | 1160 | static __init int early_put_chars(u32 vtermno, const char *buf, int count) |
1008 | { | 1161 | { |
1009 | char scratch[17]; | 1162 | char scratch[17]; |
1010 | unsigned int len = count; | 1163 | unsigned int len = count; |
1011 | 1164 | ||
1012 | /* We use a nul-terminated string, so we have to make a copy. Icky, | 1165 | /* We use a nul-terminated string, so we make a copy. Icky, huh? */ |
1013 | * huh? */ | ||
1014 | if (len > sizeof(scratch) - 1) | 1166 | if (len > sizeof(scratch) - 1) |
1015 | len = sizeof(scratch) - 1; | 1167 | len = sizeof(scratch) - 1; |
1016 | scratch[len] = '\0'; | 1168 | scratch[len] = '\0'; |
@@ -1021,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) | |||
1021 | return len; | 1173 | return len; |
1022 | } | 1174 | } |
1023 | 1175 | ||
1024 | /* Rebooting also tells the Host we're finished, but the RESTART flag tells the | 1176 | /* |
1025 | * Launcher to reboot us. */ | 1177 | * Rebooting also tells the Host we're finished, but the RESTART flag tells the |
1178 | * Launcher to reboot us. | ||
1179 | */ | ||
1026 | static void lguest_restart(char *reason) | 1180 | static void lguest_restart(char *reason) |
1027 | { | 1181 | { |
1028 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); | 1182 | kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); |
@@ -1049,7 +1203,8 @@ static void lguest_restart(char *reason) | |||
1049 | * fit comfortably. | 1203 | * fit comfortably. |
1050 | * | 1204 | * |
1051 | * First we need assembly templates of each of the patchable Guest operations, | 1205 | * First we need assembly templates of each of the patchable Guest operations, |
1052 | * and these are in i386_head.S. */ | 1206 | * and these are in i386_head.S. |
1207 | */ | ||
1053 | 1208 | ||
1054 | /*G:060 We construct a table from the assembler templates: */ | 1209 | /*G:060 We construct a table from the assembler templates: */ |
1055 | static const struct lguest_insns | 1210 | static const struct lguest_insns |
@@ -1060,9 +1215,11 @@ static const struct lguest_insns | |||
1060 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, | 1215 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, |
1061 | }; | 1216 | }; |
1062 | 1217 | ||
1063 | /* Now our patch routine is fairly simple (based on the native one in | 1218 | /* |
1219 | * Now our patch routine is fairly simple (based on the native one in | ||
1064 | * paravirt.c). If we have a replacement, we copy it in and return how much of | 1220 | * paravirt.c). If we have a replacement, we copy it in and return how much of |
1065 | * the available space we used. */ | 1221 | * the available space we used. |
1222 | */ | ||
1066 | static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | 1223 | static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, |
1067 | unsigned long addr, unsigned len) | 1224 | unsigned long addr, unsigned len) |
1068 | { | 1225 | { |
@@ -1074,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | |||
1074 | 1231 | ||
1075 | insn_len = lguest_insns[type].end - lguest_insns[type].start; | 1232 | insn_len = lguest_insns[type].end - lguest_insns[type].start; |
1076 | 1233 | ||
1077 | /* Similarly if we can't fit replacement (shouldn't happen, but let's | 1234 | /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ |
1078 | * be thorough). */ | ||
1079 | if (len < insn_len) | 1235 | if (len < insn_len) |
1080 | return paravirt_patch_default(type, clobber, ibuf, addr, len); | 1236 | return paravirt_patch_default(type, clobber, ibuf, addr, len); |
1081 | 1237 | ||
@@ -1084,22 +1240,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, | |||
1084 | return insn_len; | 1240 | return insn_len; |
1085 | } | 1241 | } |
1086 | 1242 | ||
1087 | /*G:029 Once we get to lguest_init(), we know we're a Guest. The various | 1243 | /*G:029 |
1244 | * Once we get to lguest_init(), we know we're a Guest. The various | ||
1088 | * pv_ops structures in the kernel provide points for (almost) every routine we | 1245 | * pv_ops structures in the kernel provide points for (almost) every routine we |
1089 | * have to override to avoid privileged instructions. */ | 1246 | * have to override to avoid privileged instructions. |
1247 | */ | ||
1090 | __init void lguest_init(void) | 1248 | __init void lguest_init(void) |
1091 | { | 1249 | { |
1092 | /* We're under lguest, paravirt is enabled, and we're running at | 1250 | /* We're under lguest. */ |
1093 | * privilege level 1, not 0 as normal. */ | ||
1094 | pv_info.name = "lguest"; | 1251 | pv_info.name = "lguest"; |
1252 | /* Paravirt is enabled. */ | ||
1095 | pv_info.paravirt_enabled = 1; | 1253 | pv_info.paravirt_enabled = 1; |
1254 | /* We're running at privilege level 1, not 0 as normal. */ | ||
1096 | pv_info.kernel_rpl = 1; | 1255 | pv_info.kernel_rpl = 1; |
1256 | /* Everyone except Xen runs with this set. */ | ||
1097 | pv_info.shared_kernel_pmd = 1; | 1257 | pv_info.shared_kernel_pmd = 1; |
1098 | 1258 | ||
1099 | /* We set up all the lguest overrides for sensitive operations. These | 1259 | /* |
1100 | * are detailed with the operations themselves. */ | 1260 | * We set up all the lguest overrides for sensitive operations. These |
1261 | * are detailed with the operations themselves. | ||
1262 | */ | ||
1101 | 1263 | ||
1102 | /* interrupt-related operations */ | 1264 | /* Interrupt-related operations */ |
1103 | pv_irq_ops.init_IRQ = lguest_init_IRQ; | 1265 | pv_irq_ops.init_IRQ = lguest_init_IRQ; |
1104 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); | 1266 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); |
1105 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); | 1267 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); |
@@ -1107,11 +1269,11 @@ __init void lguest_init(void) | |||
1107 | pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); | 1269 | pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); |
1108 | pv_irq_ops.safe_halt = lguest_safe_halt; | 1270 | pv_irq_ops.safe_halt = lguest_safe_halt; |
1109 | 1271 | ||
1110 | /* init-time operations */ | 1272 | /* Setup operations */ |
1111 | pv_init_ops.memory_setup = lguest_memory_setup; | 1273 | pv_init_ops.memory_setup = lguest_memory_setup; |
1112 | pv_init_ops.patch = lguest_patch; | 1274 | pv_init_ops.patch = lguest_patch; |
1113 | 1275 | ||
1114 | /* Intercepts of various cpu instructions */ | 1276 | /* Intercepts of various CPU instructions */ |
1115 | pv_cpu_ops.load_gdt = lguest_load_gdt; | 1277 | pv_cpu_ops.load_gdt = lguest_load_gdt; |
1116 | pv_cpu_ops.cpuid = lguest_cpuid; | 1278 | pv_cpu_ops.cpuid = lguest_cpuid; |
1117 | pv_cpu_ops.load_idt = lguest_load_idt; | 1279 | pv_cpu_ops.load_idt = lguest_load_idt; |
@@ -1132,7 +1294,7 @@ __init void lguest_init(void) | |||
1132 | pv_cpu_ops.start_context_switch = paravirt_start_context_switch; | 1294 | pv_cpu_ops.start_context_switch = paravirt_start_context_switch; |
1133 | pv_cpu_ops.end_context_switch = lguest_end_context_switch; | 1295 | pv_cpu_ops.end_context_switch = lguest_end_context_switch; |
1134 | 1296 | ||
1135 | /* pagetable management */ | 1297 | /* Pagetable management */ |
1136 | pv_mmu_ops.write_cr3 = lguest_write_cr3; | 1298 | pv_mmu_ops.write_cr3 = lguest_write_cr3; |
1137 | pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; | 1299 | pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; |
1138 | pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; | 1300 | pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; |
@@ -1154,54 +1316,71 @@ __init void lguest_init(void) | |||
1154 | pv_mmu_ops.pte_update_defer = lguest_pte_update; | 1316 | pv_mmu_ops.pte_update_defer = lguest_pte_update; |
1155 | 1317 | ||
1156 | #ifdef CONFIG_X86_LOCAL_APIC | 1318 | #ifdef CONFIG_X86_LOCAL_APIC |
1157 | /* apic read/write intercepts */ | 1319 | /* APIC read/write intercepts */ |
1158 | set_lguest_basic_apic_ops(); | 1320 | set_lguest_basic_apic_ops(); |
1159 | #endif | 1321 | #endif |
1160 | 1322 | ||
1161 | /* time operations */ | 1323 | /* Time operations */ |
1162 | pv_time_ops.get_wallclock = lguest_get_wallclock; | 1324 | pv_time_ops.get_wallclock = lguest_get_wallclock; |
1163 | pv_time_ops.time_init = lguest_time_init; | 1325 | pv_time_ops.time_init = lguest_time_init; |
1164 | pv_time_ops.get_tsc_khz = lguest_tsc_khz; | 1326 | pv_time_ops.get_tsc_khz = lguest_tsc_khz; |
1165 | 1327 | ||
1166 | /* Now is a good time to look at the implementations of these functions | 1328 | /* |
1167 | * before returning to the rest of lguest_init(). */ | 1329 | * Now is a good time to look at the implementations of these functions |
1330 | * before returning to the rest of lguest_init(). | ||
1331 | */ | ||
1168 | 1332 | ||
1169 | /*G:070 Now we've seen all the paravirt_ops, we return to | 1333 | /*G:070 |
1334 | * Now we've seen all the paravirt_ops, we return to | ||
1170 | * lguest_init() where the rest of the fairly chaotic boot setup | 1335 | * lguest_init() where the rest of the fairly chaotic boot setup |
1171 | * occurs. */ | 1336 | * occurs. |
1337 | */ | ||
1172 | 1338 | ||
1173 | /* The stack protector is a weird thing where gcc places a canary | 1339 | /* |
1340 | * The stack protector is a weird thing where gcc places a canary | ||
1174 | * value on the stack and then checks it on return. This file is | 1341 | * value on the stack and then checks it on return. This file is |
1175 | * compiled with -fno-stack-protector it, so we got this far without | 1342 | * compiled with -fno-stack-protector it, so we got this far without |
1176 | * problems. The value of the canary is kept at offset 20 from the | 1343 | * problems. The value of the canary is kept at offset 20 from the |
1177 | * %gs register, so we need to set that up before calling C functions | 1344 | * %gs register, so we need to set that up before calling C functions |
1178 | * in other files. */ | 1345 | * in other files. |
1346 | */ | ||
1179 | setup_stack_canary_segment(0); | 1347 | setup_stack_canary_segment(0); |
1180 | /* We could just call load_stack_canary_segment(), but we might as | 1348 | |
1181 | * call switch_to_new_gdt() which loads the whole table and sets up | 1349 | /* |
1182 | * the per-cpu segment descriptor register %fs as well. */ | 1350 | * We could just call load_stack_canary_segment(), but we might as well |
1351 | * call switch_to_new_gdt() which loads the whole table and sets up the | ||
1352 | * per-cpu segment descriptor register %fs as well. | ||
1353 | */ | ||
1183 | switch_to_new_gdt(0); | 1354 | switch_to_new_gdt(0); |
1184 | 1355 | ||
1185 | /* As described in head_32.S, we map the first 128M of memory. */ | 1356 | /* We actually boot with all memory mapped, but let's say 128MB. */ |
1186 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; | 1357 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; |
1187 | 1358 | ||
1188 | /* The Host<->Guest Switcher lives at the top of our address space, and | 1359 | /* |
1360 | * The Host<->Guest Switcher lives at the top of our address space, and | ||
1189 | * the Host told us how big it is when we made LGUEST_INIT hypercall: | 1361 | * the Host told us how big it is when we made LGUEST_INIT hypercall: |
1190 | * it put the answer in lguest_data.reserve_mem */ | 1362 | * it put the answer in lguest_data.reserve_mem |
1363 | */ | ||
1191 | reserve_top_address(lguest_data.reserve_mem); | 1364 | reserve_top_address(lguest_data.reserve_mem); |
1192 | 1365 | ||
1193 | /* If we don't initialize the lock dependency checker now, it crashes | 1366 | /* |
1194 | * paravirt_disable_iospace. */ | 1367 | * If we don't initialize the lock dependency checker now, it crashes |
1368 | * paravirt_disable_iospace. | ||
1369 | */ | ||
1195 | lockdep_init(); | 1370 | lockdep_init(); |
1196 | 1371 | ||
1197 | /* The IDE code spends about 3 seconds probing for disks: if we reserve | 1372 | /* |
1373 | * The IDE code spends about 3 seconds probing for disks: if we reserve | ||
1198 | * all the I/O ports up front it can't get them and so doesn't probe. | 1374 | * all the I/O ports up front it can't get them and so doesn't probe. |
1199 | * Other device drivers are similar (but less severe). This cuts the | 1375 | * Other device drivers are similar (but less severe). This cuts the |
1200 | * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ | 1376 | * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. |
1377 | */ | ||
1201 | paravirt_disable_iospace(); | 1378 | paravirt_disable_iospace(); |
1202 | 1379 | ||
1203 | /* This is messy CPU setup stuff which the native boot code does before | 1380 | /* |
1204 | * start_kernel, so we have to do, too: */ | 1381 | * This is messy CPU setup stuff which the native boot code does before |
1382 | * start_kernel, so we have to do, too: | ||
1383 | */ | ||
1205 | cpu_detect(&new_cpu_data); | 1384 | cpu_detect(&new_cpu_data); |
1206 | /* head.S usually sets up the first capability word, so do it here. */ | 1385 | /* head.S usually sets up the first capability word, so do it here. */ |
1207 | new_cpu_data.x86_capability[0] = cpuid_edx(1); | 1386 | new_cpu_data.x86_capability[0] = cpuid_edx(1); |
@@ -1218,22 +1397,28 @@ __init void lguest_init(void) | |||
1218 | acpi_ht = 0; | 1397 | acpi_ht = 0; |
1219 | #endif | 1398 | #endif |
1220 | 1399 | ||
1221 | /* We set the preferred console to "hvc". This is the "hypervisor | 1400 | /* |
1401 | * We set the preferred console to "hvc". This is the "hypervisor | ||
1222 | * virtual console" driver written by the PowerPC people, which we also | 1402 | * virtual console" driver written by the PowerPC people, which we also |
1223 | * adapted for lguest's use. */ | 1403 | * adapted for lguest's use. |
1404 | */ | ||
1224 | add_preferred_console("hvc", 0, NULL); | 1405 | add_preferred_console("hvc", 0, NULL); |
1225 | 1406 | ||
1226 | /* Register our very early console. */ | 1407 | /* Register our very early console. */ |
1227 | virtio_cons_early_init(early_put_chars); | 1408 | virtio_cons_early_init(early_put_chars); |
1228 | 1409 | ||
1229 | /* Last of all, we set the power management poweroff hook to point to | 1410 | /* |
1411 | * Last of all, we set the power management poweroff hook to point to | ||
1230 | * the Guest routine to power off, and the reboot hook to our restart | 1412 | * the Guest routine to power off, and the reboot hook to our restart |
1231 | * routine. */ | 1413 | * routine. |
1414 | */ | ||
1232 | pm_power_off = lguest_power_off; | 1415 | pm_power_off = lguest_power_off; |
1233 | machine_ops.restart = lguest_restart; | 1416 | machine_ops.restart = lguest_restart; |
1234 | 1417 | ||
1235 | /* Now we're set up, call i386_start_kernel() in head32.c and we proceed | 1418 | /* |
1236 | * to boot as normal. It never returns. */ | 1419 | * Now we're set up, call i386_start_kernel() in head32.c and we proceed |
1420 | * to boot as normal. It never returns. | ||
1421 | */ | ||
1237 | i386_start_kernel(); | 1422 | i386_start_kernel(); |
1238 | } | 1423 | } |
1239 | /* | 1424 | /* |
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index a9c8cfe61cd4..27eac0faee48 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S | |||
@@ -5,7 +5,8 @@ | |||
5 | #include <asm/thread_info.h> | 5 | #include <asm/thread_info.h> |
6 | #include <asm/processor-flags.h> | 6 | #include <asm/processor-flags.h> |
7 | 7 | ||
8 | /*G:020 Our story starts with the kernel booting into startup_32 in | 8 | /*G:020 |
9 | * Our story starts with the kernel booting into startup_32 in | ||
9 | * arch/x86/kernel/head_32.S. It expects a boot header, which is created by | 10 | * arch/x86/kernel/head_32.S. It expects a boot header, which is created by |
10 | * the bootloader (the Launcher in our case). | 11 | * the bootloader (the Launcher in our case). |
11 | * | 12 | * |
@@ -21,11 +22,14 @@ | |||
21 | * data without remembering to subtract __PAGE_OFFSET! | 22 | * data without remembering to subtract __PAGE_OFFSET! |
22 | * | 23 | * |
23 | * The .section line puts this code in .init.text so it will be discarded after | 24 | * The .section line puts this code in .init.text so it will be discarded after |
24 | * boot. */ | 25 | * boot. |
26 | */ | ||
25 | .section .init.text, "ax", @progbits | 27 | .section .init.text, "ax", @progbits |
26 | ENTRY(lguest_entry) | 28 | ENTRY(lguest_entry) |
27 | /* We make the "initialization" hypercall now to tell the Host about | 29 | /* |
28 | * us, and also find out where it put our page tables. */ | 30 | * We make the "initialization" hypercall now to tell the Host about |
31 | * us, and also find out where it put our page tables. | ||
32 | */ | ||
29 | movl $LHCALL_LGUEST_INIT, %eax | 33 | movl $LHCALL_LGUEST_INIT, %eax |
30 | movl $lguest_data - __PAGE_OFFSET, %ebx | 34 | movl $lguest_data - __PAGE_OFFSET, %ebx |
31 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ | 35 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ |
@@ -33,13 +37,14 @@ ENTRY(lguest_entry) | |||
33 | /* Set up the initial stack so we can run C code. */ | 37 | /* Set up the initial stack so we can run C code. */ |
34 | movl $(init_thread_union+THREAD_SIZE),%esp | 38 | movl $(init_thread_union+THREAD_SIZE),%esp |
35 | 39 | ||
36 | /* Jumps are relative, and we're running __PAGE_OFFSET too low at the | 40 | /* Jumps are relative: we're running __PAGE_OFFSET too low. */ |
37 | * moment. */ | ||
38 | jmp lguest_init+__PAGE_OFFSET | 41 | jmp lguest_init+__PAGE_OFFSET |
39 | 42 | ||
40 | /*G:055 We create a macro which puts the assembler code between lgstart_ and | 43 | /*G:055 |
41 | * lgend_ markers. These templates are put in the .text section: they can't be | 44 | * We create a macro which puts the assembler code between lgstart_ and lgend_ |
42 | * discarded after boot as we may need to patch modules, too. */ | 45 | * markers. These templates are put in the .text section: they can't be |
46 | * discarded after boot as we may need to patch modules, too. | ||
47 | */ | ||
43 | .text | 48 | .text |
44 | #define LGUEST_PATCH(name, insns...) \ | 49 | #define LGUEST_PATCH(name, insns...) \ |
45 | lgstart_##name: insns; lgend_##name:; \ | 50 | lgstart_##name: insns; lgend_##name:; \ |
@@ -48,83 +53,103 @@ ENTRY(lguest_entry) | |||
48 | LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) | 53 | LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) |
49 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) | 54 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) |
50 | 55 | ||
51 | /*G:033 But using those wrappers is inefficient (we'll see why that doesn't | 56 | /*G:033 |
52 | * matter for save_fl and irq_disable later). If we write our routines | 57 | * But using those wrappers is inefficient (we'll see why that doesn't matter |
53 | * carefully in assembler, we can avoid clobbering any registers and avoid | 58 | * for save_fl and irq_disable later). If we write our routines carefully in |
54 | * jumping through the wrapper functions. | 59 | * assembler, we can avoid clobbering any registers and avoid jumping through |
60 | * the wrapper functions. | ||
55 | * | 61 | * |
56 | * I skipped over our first piece of assembler, but this one is worth studying | 62 | * I skipped over our first piece of assembler, but this one is worth studying |
57 | * in a bit more detail so I'll describe in easy stages. First, the routine | 63 | * in a bit more detail so I'll describe in easy stages. First, the routine to |
58 | * to enable interrupts: */ | 64 | * enable interrupts: |
65 | */ | ||
59 | ENTRY(lg_irq_enable) | 66 | ENTRY(lg_irq_enable) |
60 | /* The reverse of irq_disable, this sets lguest_data.irq_enabled to | 67 | /* |
61 | * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ | 68 | * The reverse of irq_disable, this sets lguest_data.irq_enabled to |
69 | * X86_EFLAGS_IF (ie. "Interrupts enabled"). | ||
70 | */ | ||
62 | movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled | 71 | movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled |
63 | /* But now we need to check if the Host wants to know: there might have | 72 | /* |
73 | * But now we need to check if the Host wants to know: there might have | ||
64 | * been interrupts waiting to be delivered, in which case it will have | 74 | * been interrupts waiting to be delivered, in which case it will have |
65 | * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we | 75 | * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we |
66 | * jump to send_interrupts, otherwise we're done. */ | 76 | * jump to send_interrupts, otherwise we're done. |
77 | */ | ||
67 | testl $0, lguest_data+LGUEST_DATA_irq_pending | 78 | testl $0, lguest_data+LGUEST_DATA_irq_pending |
68 | jnz send_interrupts | 79 | jnz send_interrupts |
69 | /* One cool thing about x86 is that you can do many things without using | 80 | /* |
81 | * One cool thing about x86 is that you can do many things without using | ||
70 | * a register. In this case, the normal path hasn't needed to save or | 82 | * a register. In this case, the normal path hasn't needed to save or |
71 | * restore any registers at all! */ | 83 | * restore any registers at all! |
84 | */ | ||
72 | ret | 85 | ret |
73 | send_interrupts: | 86 | send_interrupts: |
74 | /* OK, now we need a register: eax is used for the hypercall number, | 87 | /* |
88 | * OK, now we need a register: eax is used for the hypercall number, | ||
75 | * which is LHCALL_SEND_INTERRUPTS. | 89 | * which is LHCALL_SEND_INTERRUPTS. |
76 | * | 90 | * |
77 | * We used not to bother with this pending detection at all, which was | 91 | * We used not to bother with this pending detection at all, which was |
78 | * much simpler. Sooner or later the Host would realize it had to | 92 | * much simpler. Sooner or later the Host would realize it had to |
79 | * send us an interrupt. But that turns out to make performance 7 | 93 | * send us an interrupt. But that turns out to make performance 7 |
80 | * times worse on a simple tcp benchmark. So now we do this the hard | 94 | * times worse on a simple tcp benchmark. So now we do this the hard |
81 | * way. */ | 95 | * way. |
96 | */ | ||
82 | pushl %eax | 97 | pushl %eax |
83 | movl $LHCALL_SEND_INTERRUPTS, %eax | 98 | movl $LHCALL_SEND_INTERRUPTS, %eax |
84 | /* This is a vmcall instruction (same thing that KVM uses). Older | 99 | /* |
100 | * This is a vmcall instruction (same thing that KVM uses). Older | ||
85 | * assembler versions might not know the "vmcall" instruction, so we | 101 | * assembler versions might not know the "vmcall" instruction, so we |
86 | * create one manually here. */ | 102 | * create one manually here. |
103 | */ | ||
87 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ | 104 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ |
105 | /* Put eax back the way we found it. */ | ||
88 | popl %eax | 106 | popl %eax |
89 | ret | 107 | ret |
90 | 108 | ||
91 | /* Finally, the "popf" or "restore flags" routine. The %eax register holds the | 109 | /* |
110 | * Finally, the "popf" or "restore flags" routine. The %eax register holds the | ||
92 | * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're | 111 | * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're |
93 | * enabling interrupts again, if it's 0 we're leaving them off. */ | 112 | * enabling interrupts again, if it's 0 we're leaving them off. |
113 | */ | ||
94 | ENTRY(lg_restore_fl) | 114 | ENTRY(lg_restore_fl) |
95 | /* This is just "lguest_data.irq_enabled = flags;" */ | 115 | /* This is just "lguest_data.irq_enabled = flags;" */ |
96 | movl %eax, lguest_data+LGUEST_DATA_irq_enabled | 116 | movl %eax, lguest_data+LGUEST_DATA_irq_enabled |
97 | /* Now, if the %eax value has enabled interrupts and | 117 | /* |
118 | * Now, if the %eax value has enabled interrupts and | ||
98 | * lguest_data.irq_pending is set, we want to tell the Host so it can | 119 | * lguest_data.irq_pending is set, we want to tell the Host so it can |
99 | * deliver any outstanding interrupts. Fortunately, both values will | 120 | * deliver any outstanding interrupts. Fortunately, both values will |
100 | * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" | 121 | * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" |
101 | * instruction will AND them together for us. If both are set, we | 122 | * instruction will AND them together for us. If both are set, we |
102 | * jump to send_interrupts. */ | 123 | * jump to send_interrupts. |
124 | */ | ||
103 | testl lguest_data+LGUEST_DATA_irq_pending, %eax | 125 | testl lguest_data+LGUEST_DATA_irq_pending, %eax |
104 | jnz send_interrupts | 126 | jnz send_interrupts |
105 | /* Again, the normal path has used no extra registers. Clever, huh? */ | 127 | /* Again, the normal path has used no extra registers. Clever, huh? */ |
106 | ret | 128 | ret |
129 | /*:*/ | ||
107 | 130 | ||
108 | /* These demark the EIP range where host should never deliver interrupts. */ | 131 | /* These demark the EIP range where host should never deliver interrupts. */ |
109 | .global lguest_noirq_start | 132 | .global lguest_noirq_start |
110 | .global lguest_noirq_end | 133 | .global lguest_noirq_end |
111 | 134 | ||
112 | /*M:004 When the Host reflects a trap or injects an interrupt into the Guest, | 135 | /*M:004 |
113 | * it sets the eflags interrupt bit on the stack based on | 136 | * When the Host reflects a trap or injects an interrupt into the Guest, it |
114 | * lguest_data.irq_enabled, so the Guest iret logic does the right thing when | 137 | * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, |
115 | * restoring it. However, when the Host sets the Guest up for direct traps, | 138 | * so the Guest iret logic does the right thing when restoring it. However, |
116 | * such as system calls, the processor is the one to push eflags onto the | 139 | * when the Host sets the Guest up for direct traps, such as system calls, the |
117 | * stack, and the interrupt bit will be 1 (in reality, interrupts are always | 140 | * processor is the one to push eflags onto the stack, and the interrupt bit |
118 | * enabled in the Guest). | 141 | * will be 1 (in reality, interrupts are always enabled in the Guest). |
119 | * | 142 | * |
120 | * This turns out to be harmless: the only trap which should happen under Linux | 143 | * This turns out to be harmless: the only trap which should happen under Linux |
121 | * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc | 144 | * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc |
122 | * regions), which has to be reflected through the Host anyway. If another | 145 | * regions), which has to be reflected through the Host anyway. If another |
123 | * trap *does* go off when interrupts are disabled, the Guest will panic, and | 146 | * trap *does* go off when interrupts are disabled, the Guest will panic, and |
124 | * we'll never get to this iret! :*/ | 147 | * we'll never get to this iret! |
148 | :*/ | ||
125 | 149 | ||
126 | /*G:045 There is one final paravirt_op that the Guest implements, and glancing | 150 | /*G:045 |
127 | * at it you can see why I left it to last. It's *cool*! It's in *assembler*! | 151 | * There is one final paravirt_op that the Guest implements, and glancing at it |
152 | * you can see why I left it to last. It's *cool*! It's in *assembler*! | ||
128 | * | 153 | * |
129 | * The "iret" instruction is used to return from an interrupt or trap. The | 154 | * The "iret" instruction is used to return from an interrupt or trap. The |
130 | * stack looks like this: | 155 | * stack looks like this: |
@@ -148,15 +173,18 @@ ENTRY(lg_restore_fl) | |||
148 | * return to userspace or wherever. Our solution to this is to surround the | 173 | * return to userspace or wherever. Our solution to this is to surround the |
149 | * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the | 174 | * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the |
150 | * Host that it is *never* to interrupt us there, even if interrupts seem to be | 175 | * Host that it is *never* to interrupt us there, even if interrupts seem to be |
151 | * enabled. */ | 176 | * enabled. |
177 | */ | ||
152 | ENTRY(lguest_iret) | 178 | ENTRY(lguest_iret) |
153 | pushl %eax | 179 | pushl %eax |
154 | movl 12(%esp), %eax | 180 | movl 12(%esp), %eax |
155 | lguest_noirq_start: | 181 | lguest_noirq_start: |
156 | /* Note the %ss: segment prefix here. Normal data accesses use the | 182 | /* |
183 | * Note the %ss: segment prefix here. Normal data accesses use the | ||
157 | * "ds" segment, but that will have already been restored for whatever | 184 | * "ds" segment, but that will have already been restored for whatever |
158 | * we're returning to (such as userspace): we can't trust it. The %ss: | 185 | * we're returning to (such as userspace): we can't trust it. The %ss: |
159 | * prefix makes sure we use the stack segment, which is still valid. */ | 186 | * prefix makes sure we use the stack segment, which is still valid. |
187 | */ | ||
160 | movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled | 188 | movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled |
161 | popl %eax | 189 | popl %eax |
162 | iret | 190 | iret |
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 1440b9c0547e..caa24aca8115 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c | |||
@@ -89,16 +89,13 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) | |||
89 | rv.msrs = msrs; | 89 | rv.msrs = msrs; |
90 | rv.msr_no = msr_no; | 90 | rv.msr_no = msr_no; |
91 | 91 | ||
92 | preempt_disable(); | 92 | this_cpu = get_cpu(); |
93 | /* | 93 | |
94 | * FIXME: handle the CPU we're executing on separately for now until | 94 | if (cpumask_test_cpu(this_cpu, mask)) |
95 | * smp_call_function_many has been fixed to not skip it. | 95 | __rdmsr_on_cpu(&rv); |
96 | */ | ||
97 | this_cpu = raw_smp_processor_id(); | ||
98 | smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1); | ||
99 | 96 | ||
100 | smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); | 97 | smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); |
101 | preempt_enable(); | 98 | put_cpu(); |
102 | } | 99 | } |
103 | EXPORT_SYMBOL(rdmsr_on_cpus); | 100 | EXPORT_SYMBOL(rdmsr_on_cpus); |
104 | 101 | ||
@@ -121,16 +118,13 @@ void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) | |||
121 | rv.msrs = msrs; | 118 | rv.msrs = msrs; |
122 | rv.msr_no = msr_no; | 119 | rv.msr_no = msr_no; |
123 | 120 | ||
124 | preempt_disable(); | 121 | this_cpu = get_cpu(); |
125 | /* | 122 | |
126 | * FIXME: handle the CPU we're executing on separately for now until | 123 | if (cpumask_test_cpu(this_cpu, mask)) |
127 | * smp_call_function_many has been fixed to not skip it. | 124 | __wrmsr_on_cpu(&rv); |
128 | */ | ||
129 | this_cpu = raw_smp_processor_id(); | ||
130 | smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1); | ||
131 | 125 | ||
132 | smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); | 126 | smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); |
133 | preempt_enable(); | 127 | put_cpu(); |
134 | } | 128 | } |
135 | EXPORT_SYMBOL(wrmsr_on_cpus); | 129 | EXPORT_SYMBOL(wrmsr_on_cpus); |
136 | 130 | ||
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 58f621e81919..2112ed55e7ea 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c | |||
@@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmap); | |||
103 | EXPORT_SYMBOL(kunmap); | 103 | EXPORT_SYMBOL(kunmap); |
104 | EXPORT_SYMBOL(kmap_atomic); | 104 | EXPORT_SYMBOL(kmap_atomic); |
105 | EXPORT_SYMBOL(kunmap_atomic); | 105 | EXPORT_SYMBOL(kunmap_atomic); |
106 | EXPORT_SYMBOL(kmap_atomic_prot); | ||
106 | 107 | ||
107 | void __init set_highmem_pages_init(void) | 108 | void __init set_highmem_pages_init(void) |
108 | { | 109 | { |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6176fe8f29e0..ea56b8cbb6a6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -796,7 +796,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | |||
796 | return ret; | 796 | return ret; |
797 | 797 | ||
798 | #else | 798 | #else |
799 | reserve_bootmem(phys, len, BOOTMEM_DEFAULT); | 799 | reserve_bootmem(phys, len, flags); |
800 | #endif | 800 | #endif |
801 | 801 | ||
802 | if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { | 802 | if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1b734d7a8966..7e600c1962db 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -591,9 +591,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) | |||
591 | unsigned int level; | 591 | unsigned int level; |
592 | pte_t *kpte, old_pte; | 592 | pte_t *kpte, old_pte; |
593 | 593 | ||
594 | if (cpa->flags & CPA_PAGES_ARRAY) | 594 | if (cpa->flags & CPA_PAGES_ARRAY) { |
595 | address = (unsigned long)page_address(cpa->pages[cpa->curpage]); | 595 | struct page *page = cpa->pages[cpa->curpage]; |
596 | else if (cpa->flags & CPA_ARRAY) | 596 | if (unlikely(PageHighMem(page))) |
597 | return 0; | ||
598 | address = (unsigned long)page_address(page); | ||
599 | } else if (cpa->flags & CPA_ARRAY) | ||
597 | address = cpa->vaddr[cpa->curpage]; | 600 | address = cpa->vaddr[cpa->curpage]; |
598 | else | 601 | else |
599 | address = *cpa->vaddr; | 602 | address = *cpa->vaddr; |
@@ -697,9 +700,12 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
697 | * No need to redo, when the primary call touched the direct | 700 | * No need to redo, when the primary call touched the direct |
698 | * mapping already: | 701 | * mapping already: |
699 | */ | 702 | */ |
700 | if (cpa->flags & CPA_PAGES_ARRAY) | 703 | if (cpa->flags & CPA_PAGES_ARRAY) { |
701 | vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); | 704 | struct page *page = cpa->pages[cpa->curpage]; |
702 | else if (cpa->flags & CPA_ARRAY) | 705 | if (unlikely(PageHighMem(page))) |
706 | return 0; | ||
707 | vaddr = (unsigned long)page_address(page); | ||
708 | } else if (cpa->flags & CPA_ARRAY) | ||
703 | vaddr = cpa->vaddr[cpa->curpage]; | 709 | vaddr = cpa->vaddr[cpa->curpage]; |
704 | else | 710 | else |
705 | vaddr = *cpa->vaddr; | 711 | vaddr = *cpa->vaddr; |
@@ -997,12 +1003,15 @@ EXPORT_SYMBOL(set_memory_array_uc); | |||
997 | int _set_memory_wc(unsigned long addr, int numpages) | 1003 | int _set_memory_wc(unsigned long addr, int numpages) |
998 | { | 1004 | { |
999 | int ret; | 1005 | int ret; |
1006 | unsigned long addr_copy = addr; | ||
1007 | |||
1000 | ret = change_page_attr_set(&addr, numpages, | 1008 | ret = change_page_attr_set(&addr, numpages, |
1001 | __pgprot(_PAGE_CACHE_UC_MINUS), 0); | 1009 | __pgprot(_PAGE_CACHE_UC_MINUS), 0); |
1002 | |||
1003 | if (!ret) { | 1010 | if (!ret) { |
1004 | ret = change_page_attr_set(&addr, numpages, | 1011 | ret = change_page_attr_set_clr(&addr_copy, numpages, |
1005 | __pgprot(_PAGE_CACHE_WC), 0); | 1012 | __pgprot(_PAGE_CACHE_WC), |
1013 | __pgprot(_PAGE_CACHE_MASK), | ||
1014 | 0, 0, NULL); | ||
1006 | } | 1015 | } |
1007 | return ret; | 1016 | return ret; |
1008 | } | 1017 | } |
@@ -1119,7 +1128,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray) | |||
1119 | int free_idx; | 1128 | int free_idx; |
1120 | 1129 | ||
1121 | for (i = 0; i < addrinarray; i++) { | 1130 | for (i = 0; i < addrinarray; i++) { |
1122 | start = (unsigned long)page_address(pages[i]); | 1131 | if (PageHighMem(pages[i])) |
1132 | continue; | ||
1133 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; | ||
1123 | end = start + PAGE_SIZE; | 1134 | end = start + PAGE_SIZE; |
1124 | if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) | 1135 | if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) |
1125 | goto err_out; | 1136 | goto err_out; |
@@ -1132,7 +1143,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray) | |||
1132 | err_out: | 1143 | err_out: |
1133 | free_idx = i; | 1144 | free_idx = i; |
1134 | for (i = 0; i < free_idx; i++) { | 1145 | for (i = 0; i < free_idx; i++) { |
1135 | start = (unsigned long)page_address(pages[i]); | 1146 | if (PageHighMem(pages[i])) |
1147 | continue; | ||
1148 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; | ||
1136 | end = start + PAGE_SIZE; | 1149 | end = start + PAGE_SIZE; |
1137 | free_memtype(start, end); | 1150 | free_memtype(start, end); |
1138 | } | 1151 | } |
@@ -1161,7 +1174,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray) | |||
1161 | return retval; | 1174 | return retval; |
1162 | 1175 | ||
1163 | for (i = 0; i < addrinarray; i++) { | 1176 | for (i = 0; i < addrinarray; i++) { |
1164 | start = (unsigned long)page_address(pages[i]); | 1177 | if (PageHighMem(pages[i])) |
1178 | continue; | ||
1179 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; | ||
1165 | end = start + PAGE_SIZE; | 1180 | end = start + PAGE_SIZE; |
1166 | free_memtype(start, end); | 1181 | free_memtype(start, end); |
1167 | } | 1182 | } |
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e6718bb28065..352aa9e927e2 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c | |||
@@ -623,7 +623,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, | |||
623 | return ret; | 623 | return ret; |
624 | 624 | ||
625 | if (flags != want_flags) { | 625 | if (flags != want_flags) { |
626 | if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) { | 626 | if (strict_prot || |
627 | !is_new_memtype_allowed(paddr, size, want_flags, flags)) { | ||
627 | free_memtype(paddr, paddr + size); | 628 | free_memtype(paddr, paddr + size); |
628 | printk(KERN_ERR "%s:%d map pfn expected mapping type %s" | 629 | printk(KERN_ERR "%s:%d map pfn expected mapping type %s" |
629 | " for %Lx-%Lx, got %s\n", | 630 | " for %Lx-%Lx, got %s\n", |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8e43bdd45456..ed34f5e35999 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -25,7 +25,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) | |||
25 | return pte; | 25 | return pte; |
26 | } | 26 | } |
27 | 27 | ||
28 | void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) | 28 | void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) |
29 | { | 29 | { |
30 | pgtable_page_dtor(pte); | 30 | pgtable_page_dtor(pte); |
31 | paravirt_release_pte(page_to_pfn(pte)); | 31 | paravirt_release_pte(page_to_pfn(pte)); |
@@ -33,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) | |||
33 | } | 33 | } |
34 | 34 | ||
35 | #if PAGETABLE_LEVELS > 2 | 35 | #if PAGETABLE_LEVELS > 2 |
36 | void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) | 36 | void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) |
37 | { | 37 | { |
38 | paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); | 38 | paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); |
39 | tlb_remove_page(tlb, virt_to_page(pmd)); | 39 | tlb_remove_page(tlb, virt_to_page(pmd)); |
40 | } | 40 | } |
41 | 41 | ||
42 | #if PAGETABLE_LEVELS > 3 | 42 | #if PAGETABLE_LEVELS > 3 |
43 | void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) | 43 | void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) |
44 | { | 44 | { |
45 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); | 45 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); |
46 | tlb_remove_page(tlb, virt_to_page(pud)); | 46 | tlb_remove_page(tlb, virt_to_page(pud)); |
@@ -329,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve) | |||
329 | printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", | 329 | printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", |
330 | (int)-reserve); | 330 | (int)-reserve); |
331 | __FIXADDR_TOP = -reserve - PAGE_SIZE; | 331 | __FIXADDR_TOP = -reserve - PAGE_SIZE; |
332 | __VMALLOC_RESERVE += reserve; | ||
333 | #endif | 332 | #endif |
334 | } | 333 | } |
335 | 334 | ||
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 2dfcbf9df2ae..dbb5381f7b3b 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c | |||
@@ -79,8 +79,10 @@ static __init void bad_srat(void) | |||
79 | acpi_numa = -1; | 79 | acpi_numa = -1; |
80 | for (i = 0; i < MAX_LOCAL_APIC; i++) | 80 | for (i = 0; i < MAX_LOCAL_APIC; i++) |
81 | apicid_to_node[i] = NUMA_NO_NODE; | 81 | apicid_to_node[i] = NUMA_NO_NODE; |
82 | for (i = 0; i < MAX_NUMNODES; i++) | 82 | for (i = 0; i < MAX_NUMNODES; i++) { |
83 | nodes_add[i].start = nodes[i].end = 0; | 83 | nodes[i].start = nodes[i].end = 0; |
84 | nodes_add[i].start = nodes_add[i].end = 0; | ||
85 | } | ||
84 | remove_all_active_ranges(); | 86 | remove_all_active_ranges(); |
85 | } | 87 | } |
86 | 88 | ||
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 821e97017e95..c814e144a3f0 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -183,18 +183,17 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, | |||
183 | 183 | ||
184 | f->flush_mm = mm; | 184 | f->flush_mm = mm; |
185 | f->flush_va = va; | 185 | f->flush_va = va; |
186 | cpumask_andnot(to_cpumask(f->flush_cpumask), | 186 | if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { |
187 | cpumask, cpumask_of(smp_processor_id())); | 187 | /* |
188 | 188 | * We have to send the IPI only to | |
189 | /* | 189 | * CPUs affected. |
190 | * We have to send the IPI only to | 190 | */ |
191 | * CPUs affected. | 191 | apic->send_IPI_mask(to_cpumask(f->flush_cpumask), |
192 | */ | 192 | INVALIDATE_TLB_VECTOR_START + sender); |
193 | apic->send_IPI_mask(to_cpumask(f->flush_cpumask), | ||
194 | INVALIDATE_TLB_VECTOR_START + sender); | ||
195 | 193 | ||
196 | while (!cpumask_empty(to_cpumask(f->flush_cpumask))) | 194 | while (!cpumask_empty(to_cpumask(f->flush_cpumask))) |
197 | cpu_relax(); | 195 | cpu_relax(); |
196 | } | ||
198 | 197 | ||
199 | f->flush_mm = NULL; | 198 | f->flush_mm = NULL; |
200 | f->flush_va = 0; | 199 | f->flush_va = 0; |
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 172438f86a02..7410640db173 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile | |||
@@ -5,6 +5,10 @@ CFLAGS_REMOVE_time.o = -pg | |||
5 | CFLAGS_REMOVE_irq.o = -pg | 5 | CFLAGS_REMOVE_irq.o = -pg |
6 | endif | 6 | endif |
7 | 7 | ||
8 | # Make sure early boot has no stackprotector | ||
9 | nostackp := $(call cc-option, -fno-stack-protector) | ||
10 | CFLAGS_enlighten.o := $(nostackp) | ||
11 | |||
8 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ | 12 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ |
9 | time.o xen-asm.o xen-asm_$(BITS).o \ | 13 | time.o xen-asm.o xen-asm_$(BITS).o \ |
10 | grant-table.o suspend.o | 14 | grant-table.o suspend.o |
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0a1700a2be9c..e90540a46a0b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -974,10 +974,6 @@ asmlinkage void __init xen_start_kernel(void) | |||
974 | 974 | ||
975 | xen_domain_type = XEN_PV_DOMAIN; | 975 | xen_domain_type = XEN_PV_DOMAIN; |
976 | 976 | ||
977 | BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); | ||
978 | |||
979 | xen_setup_features(); | ||
980 | |||
981 | /* Install Xen paravirt ops */ | 977 | /* Install Xen paravirt ops */ |
982 | pv_info = xen_info; | 978 | pv_info = xen_info; |
983 | pv_init_ops = xen_init_ops; | 979 | pv_init_ops = xen_init_ops; |
@@ -986,8 +982,15 @@ asmlinkage void __init xen_start_kernel(void) | |||
986 | pv_apic_ops = xen_apic_ops; | 982 | pv_apic_ops = xen_apic_ops; |
987 | pv_mmu_ops = xen_mmu_ops; | 983 | pv_mmu_ops = xen_mmu_ops; |
988 | 984 | ||
989 | xen_init_irq_ops(); | 985 | #ifdef CONFIG_X86_64 |
986 | /* | ||
987 | * Setup percpu state. We only need to do this for 64-bit | ||
988 | * because 32-bit already has %fs set properly. | ||
989 | */ | ||
990 | load_percpu_segment(0); | ||
991 | #endif | ||
990 | 992 | ||
993 | xen_init_irq_ops(); | ||
991 | xen_init_cpuid_mask(); | 994 | xen_init_cpuid_mask(); |
992 | 995 | ||
993 | #ifdef CONFIG_X86_LOCAL_APIC | 996 | #ifdef CONFIG_X86_LOCAL_APIC |
@@ -997,6 +1000,8 @@ asmlinkage void __init xen_start_kernel(void) | |||
997 | set_xen_basic_apic_ops(); | 1000 | set_xen_basic_apic_ops(); |
998 | #endif | 1001 | #endif |
999 | 1002 | ||
1003 | xen_setup_features(); | ||
1004 | |||
1000 | if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { | 1005 | if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { |
1001 | pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; | 1006 | pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; |
1002 | pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; | 1007 | pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; |
@@ -1004,13 +1009,6 @@ asmlinkage void __init xen_start_kernel(void) | |||
1004 | 1009 | ||
1005 | machine_ops = xen_machine_ops; | 1010 | machine_ops = xen_machine_ops; |
1006 | 1011 | ||
1007 | #ifdef CONFIG_X86_64 | ||
1008 | /* | ||
1009 | * Setup percpu state. We only need to do this for 64-bit | ||
1010 | * because 32-bit already has %fs set properly. | ||
1011 | */ | ||
1012 | load_percpu_segment(0); | ||
1013 | #endif | ||
1014 | /* | 1012 | /* |
1015 | * The only reliable way to retain the initial address of the | 1013 | * The only reliable way to retain the initial address of the |
1016 | * percpu gdt_page is to remember it here, so we can go and | 1014 | * percpu gdt_page is to remember it here, so we can go and |