Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig                               |   1
-rw-r--r--  arch/x86/boot/header.S                         |  26
-rw-r--r--  arch/x86/boot/tools/build.c                    |  38
-rw-r--r--  arch/x86/include/asm/irqflags.h                |   2
-rw-r--r--  arch/x86/kernel/apm_32.c                       |   1
-rw-r--r--  arch/x86/kernel/cpu/intel.c                    |  22
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c          |  12
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c               |  10
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c               |   3
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h               |  12
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c         |  78
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c      |   6
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.c  |  11
-rw-r--r--  arch/x86/kernel/entry_32.S                     |   9
-rw-r--r--  arch/x86/kernel/entry_64.S                     |  28
-rw-r--r--  arch/x86/kernel/espfix_64.c                    |   5
-rw-r--r--  arch/x86/kernel/kprobes/core.c                 |   3
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c            |   2
-rw-r--r--  arch/x86/kernel/tsc.c                          |   4
-rw-r--r--  arch/x86/kvm/x86.c                             |  12
-rw-r--r--  arch/x86/xen/grant-table.c                     | 148
21 files changed, 305 insertions, 128 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a8f749ef0fdc..d24887b645dc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -131,6 +131,7 @@ config X86
 	select HAVE_CC_STACKPROTECTOR
 	select GENERIC_CPU_AUTOPROBE
 	select HAVE_ARCH_AUDITSYSCALL
+	select ARCH_SUPPORTS_ATOMIC_RMW
 
 config INSTRUCTION_DECODER
 	def_bool y
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 84c223479e3c..7a6d43a554d7 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -91,10 +91,9 @@ bs_die:
 
 	.section ".bsdata", "a"
 bugger_off_msg:
-	.ascii	"Direct floppy boot is not supported. "
-	.ascii	"Use a boot loader program instead.\r\n"
+	.ascii	"Use a boot loader.\r\n"
 	.ascii	"\n"
-	.ascii	"Remove disk and press any key to reboot ...\r\n"
+	.ascii	"Remove disk and press any key to reboot...\r\n"
 	.byte	0
 
 #ifdef CONFIG_EFI_STUB
@@ -108,7 +107,7 @@ coff_header:
 #else
 	.word	0x8664				# x86-64
 #endif
-	.word	3				# nr_sections
+	.word	4				# nr_sections
 	.long	0 				# TimeDateStamp
 	.long	0				# PointerToSymbolTable
 	.long	1				# NumberOfSymbols
@@ -250,6 +249,25 @@ section_table:
 	.word	0				# NumberOfLineNumbers
 	.long	0x60500020			# Characteristics (section flags)
 
+	#
+	# The offset & size fields are filled in by build.c.
+	#
+	.ascii	".bss"
+	.byte	0
+	.byte	0
+	.byte	0
+	.byte	0
+	.long	0
+	.long	0x0
+	.long	0				# Size of initialized data
+						# on disk
+	.long	0x0
+	.long	0				# PointerToRelocations
+	.long	0				# PointerToLineNumbers
+	.word	0				# NumberOfRelocations
+	.word	0				# NumberOfLineNumbers
+	.long	0xc8000080			# Characteristics (section flags)
+
 #endif /* CONFIG_EFI_STUB */
 
 	# Kernel attributes; used by setup.  This is part 1 of the
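
The new section header added above is easier to follow against the standard PE/COFF section-header layout: each header is a fixed 40-byte record, and the offsets that build.c patches below (0x8, 0xc, 0x10, 0x14) land on the VirtualSize, VirtualAddress, SizeOfRawData and PointerToRawData fields. The C struct that follows is an illustrative sketch of that layout only; the kernel keeps these headers as hand-written assembly and raw byte offsets, and the struct name is mine.

#include <stdint.h>

/* Illustrative only: one 40-byte PE/COFF section header, matching the
 * fields emitted in header.S above and patched by build.c below. */
struct pe_section_header {
	char     name[8];                 /* ".bss\0\0\0\0" in the block above   */
	uint32_t virtual_size;            /* +0x08, patched by build.c           */
	uint32_t virtual_address;         /* +0x0c, patched by build.c           */
	uint32_t size_of_raw_data;        /* +0x10, "initialized data on disk",
	                                     left 0 for .bss                     */
	uint32_t pointer_to_raw_data;     /* +0x14, file offset                  */
	uint32_t pointer_to_relocations;  /* +0x18                               */
	uint32_t pointer_to_line_numbers; /* +0x1c                               */
	uint16_t number_of_relocations;   /* +0x20                               */
	uint16_t number_of_line_numbers;  /* +0x22                               */
	uint32_t characteristics;         /* +0x24, 0xc8000080 for the .bss above */
};

_Static_assert(sizeof(struct pe_section_header) == 40,
	       "PE/COFF section headers are 40 bytes");

With SizeOfRawData left at zero and the characteristics word marking uninitialized data, the .bss section claims memory in the loaded image without adding a single byte to the bzImage file.
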
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 1a2f2121cada..a7661c430cd9 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -143,7 +143,7 @@ static void usage(void)
 
 #ifdef CONFIG_EFI_STUB
 
-static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset)
 {
 	unsigned int pe_header;
 	unsigned short num_sections;
@@ -164,10 +164,10 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
 	put_unaligned_le32(size, section + 0x8);
 
 	/* section header vma field */
-	put_unaligned_le32(offset, section + 0xc);
+	put_unaligned_le32(vma, section + 0xc);
 
 	/* section header 'size of initialised data' field */
-	put_unaligned_le32(size, section + 0x10);
+	put_unaligned_le32(datasz, section + 0x10);
 
 	/* section header 'file offset' field */
 	put_unaligned_le32(offset, section + 0x14);
@@ -179,6 +179,11 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
 	}
 }
 
+static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+{
+	update_pecoff_section_header_fields(section_name, offset, size, size, offset);
+}
+
 static void update_pecoff_setup_and_reloc(unsigned int size)
 {
 	u32 setup_offset = 0x200;
@@ -203,9 +208,6 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
 
 	pe_header = get_unaligned_le32(&buf[0x3c]);
 
-	/* Size of image */
-	put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
-
 	/*
 	 * Size of code: Subtract the size of the first sector (512 bytes)
 	 * which includes the header.
@@ -220,6 +222,22 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
 	update_pecoff_section_header(".text", text_start, text_sz);
 }
 
+static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz)
+{
+	unsigned int pe_header;
+	unsigned int bss_sz = init_sz - file_sz;
+
+	pe_header = get_unaligned_le32(&buf[0x3c]);
+
+	/* Size of uninitialized data */
+	put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]);
+
+	/* Size of image */
+	put_unaligned_le32(init_sz, &buf[pe_header + 0x50]);
+
+	update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0);
+}
+
 static int reserve_pecoff_reloc_section(int c)
 {
 	/* Reserve 0x20 bytes for .reloc section */
@@ -259,6 +277,8 @@ static void efi_stub_entry_update(void)
 static inline void update_pecoff_setup_and_reloc(unsigned int size) {}
 static inline void update_pecoff_text(unsigned int text_start,
 				      unsigned int file_sz) {}
+static inline void update_pecoff_bss(unsigned int file_sz,
+				     unsigned int init_sz) {}
 static inline void efi_stub_defaults(void) {}
 static inline void efi_stub_entry_update(void) {}
 
@@ -310,7 +330,7 @@ static void parse_zoffset(char *fname)
 
 int main(int argc, char ** argv)
 {
-	unsigned int i, sz, setup_sectors;
+	unsigned int i, sz, setup_sectors, init_sz;
 	int c;
 	u32 sys_size;
 	struct stat sb;
@@ -376,7 +396,9 @@ int main(int argc, char ** argv)
 	buf[0x1f1] = setup_sectors-1;
 	put_unaligned_le32(sys_size, &buf[0x1f4]);
 
-	update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz));
+	update_pecoff_text(setup_sectors * 512, i + (sys_size * 16));
+	init_sz = get_unaligned_le32(&buf[0x260]);
+	update_pecoff_bss(i + (sys_size * 16), init_sz);
 
 	efi_stub_entry_update();
 
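
Taken together, the build.c changes stop deriving SizeOfImage from the file size and instead declare the in-memory footprint advertised by the setup header's init_size field (read here from offset 0x260), with the gap between the two carried by the new .bss section. A rough standalone sketch of that arithmetic follows; the helper and struct names are hypothetical, since build.c patches its global buf[] in place rather than returning values.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical restatement of update_pecoff_text()/update_pecoff_bss():
 * file_sz is the on-disk size past the real-mode setup sectors, init_sz is
 * the in-memory footprint taken from the setup header at offset 0x260. */
struct pe_sizes {
	uint32_t size_of_image;              /* optional header +0x50         */
	uint32_t size_of_uninitialized_data; /* optional header +0x24         */
	uint32_t bss_virtual_size;           /* .bss VirtualSize (+0x8)       */
	uint32_t bss_raw_size;               /* .bss SizeOfRawData (+0x10)    */
};

static struct pe_sizes compute_pe_sizes(uint32_t file_sz, uint32_t init_sz)
{
	struct pe_sizes s;

	s.bss_virtual_size           = init_sz - file_sz; /* memory not backed by the file */
	s.bss_raw_size               = 0;                 /* nothing stored on disk        */
	s.size_of_uninitialized_data = s.bss_virtual_size;
	s.size_of_image              = init_sz;           /* no longer the file size       */
	return s;
}

int main(void)
{
	/* Example numbers only: a 5 MiB file that needs 16 MiB once loaded. */
	struct pe_sizes s = compute_pe_sizes(5u << 20, 16u << 20);

	printf("SizeOfImage=%u bss=%u\n", s.size_of_image, s.bss_virtual_size);
	return 0;
}
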
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index bba3cf88e624..0a8b519226b8 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void)
 
 #define PARAVIRT_ADJUST_EXCEPTION_FRAME	/*  */
 
-#define INTERRUPT_RETURN	iretq
+#define INTERRUPT_RETURN	jmp native_iret
 #define USERGS_SYSRET64				\
 	swapgs;					\
 	sysretq;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f3a1f04ed4cb..584874451414 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -841,7 +841,6 @@ static int apm_do_idle(void)
 	u32 eax;
 	u8 ret = 0;
 	int idled = 0;
-	int polling;
 	int err = 0;
 
 	if (!need_resched()) {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index a80029035bf2..f9e4fdd3b877 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -370,6 +370,17 @@ static void init_intel(struct cpuinfo_x86 *c)
 	 */
 	detect_extended_topology(c);
 
+	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+		/*
+		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
+		 * detection.
+		 */
+		c->x86_max_cores = intel_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+		detect_ht(c);
+#endif
+	}
+
 	l2 = init_intel_cacheinfo(c);
 	if (c->cpuid_level > 9) {
 		unsigned eax = cpuid_eax(10);
@@ -438,17 +449,6 @@ static void init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_P3);
 #endif
 
-	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
-		/*
-		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
-		 * detection.
-		 */
-		c->x86_max_cores = intel_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
-		detect_ht(c);
-#endif
-	}
-
 	/* Work around errata */
 	srat_detect_node(c);
 
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a952e9c85b6f..9c8f7394c612 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -730,6 +730,18 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
 #endif
 	}
 
+#ifdef CONFIG_X86_HT
+	/*
+	 * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
+	 * turns means that the only possibility is SMT (as indicated in
+	 * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
+	 * that SMT shares all caches, we can unconditionally set cpu_llc_id to
+	 * c->phys_proc_id.
+	 */
+	if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
+		per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+#endif
+
 	c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
 
 	return l2;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index bb92f38153b2..9a79c8dbd8e8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2451,6 +2451,12 @@ static __init int mcheck_init_device(void)
 	for_each_online_cpu(i) {
 		err = mce_device_create(i);
 		if (err) {
+			/*
+			 * Register notifier anyway (and do not unreg it) so
+			 * that we don't leave undeleted timers, see notifier
+			 * callback above.
+			 */
+			__register_hotcpu_notifier(&mce_cpu_notifier);
 			cpu_notifier_register_done();
 			goto err_device_create;
 		}
@@ -2471,10 +2477,6 @@ static __init int mcheck_init_device(void)
 err_register:
 	unregister_syscore_ops(&mce_syscore_ops);
 
-	cpu_notifier_register_begin();
-	__unregister_hotcpu_notifier(&mce_cpu_notifier);
-	cpu_notifier_register_done();
-
 err_device_create:
 	/*
 	 * We didn't keep track of which devices were created above, but
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2bdfbff8a4f6..2879ecdaac43 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -118,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 			continue;
 		if (event->attr.config1 & ~er->valid_mask)
 			return -EINVAL;
+		/* Check if the extra msrs can be safely accessed*/
+		if (!er->extra_msr_access)
+			return -ENXIO;
 
 		reg->idx = er->idx;
 		reg->config = event->attr.config1;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 3b2f9bdd974b..8ade93111e03 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -295,14 +295,16 @@ struct extra_reg {
 	u64			config_mask;
 	u64			valid_mask;
 	int			idx;  /* per_xxx->regs[] reg index */
+	bool			extra_msr_access;
 };
 
 #define EVENT_EXTRA_REG(e, ms, m, vm, i) {	\
 	.event = (e),		\
 	.msr = (ms),		\
 	.config_mask = (m),	\
 	.valid_mask = (vm),	\
 	.idx = EXTRA_REG_##i,	\
+	.extra_msr_access = true,	\
 	}
 
 #define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)	\
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index adb02aa62af5..2502d0d9d246 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1382,6 +1382,15 @@ again:
 	intel_pmu_lbr_read();
 
 	/*
+	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
+	 * and clear the bit.
+	 */
+	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+		if (!status)
+			goto done;
+	}
+
+	/*
 	 * PEBS overflow sets bit 62 in the global status register
 	 */
 	if (__test_and_clear_bit(62, (unsigned long *)&status)) {
@@ -2173,6 +2182,41 @@ static void intel_snb_check_microcode(void)
 	}
 }
 
+/*
+ * Under certain circumstances, access certain MSR may cause #GP.
+ * The function tests if the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+	u64 val_old, val_new, val_tmp;
+
+	/*
+	 * Read the current value, change it and read it back to see if it
+	 * matches, this is needed to detect certain hardware emulators
+	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+	 */
+	if (rdmsrl_safe(msr, &val_old))
+		return false;
+
+	/*
+	 * Only change the bits which can be updated by wrmsrl.
+	 */
+	val_tmp = val_old ^ mask;
+	if (wrmsrl_safe(msr, val_tmp) ||
+	    rdmsrl_safe(msr, &val_new))
+		return false;
+
+	if (val_new != val_tmp)
+		return false;
+
+	/* Here it's sure that the MSR can be safely accessed.
+	 * Restore the old value and return.
+	 */
+	wrmsrl(msr, val_old);
+
+	return true;
+}
+
 static __init void intel_sandybridge_quirk(void)
 {
 	x86_pmu.check_microcode = intel_snb_check_microcode;
@@ -2262,7 +2306,8 @@ __init int intel_pmu_init(void)
 	union cpuid10_ebx ebx;
 	struct event_constraint *c;
 	unsigned int unused;
-	int version;
+	struct extra_reg *er;
+	int version, i;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
 		switch (boot_cpu_data.x86) {
@@ -2465,6 +2510,9 @@ __init int intel_pmu_init(void)
 	case 62: /* IvyBridge EP */
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
+		/* dTLB-load-misses on IVB is different than SNB */
+		hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
 		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
 		       sizeof(hw_cache_extra_regs));
 
@@ -2565,6 +2613,34 @@ __init int intel_pmu_init(void)
 		}
 	}
 
+	/*
+	 * Access LBR MSR may cause #GP under certain circumstances.
+	 * E.g. KVM doesn't support LBR MSR
+	 * Check all LBT MSR here.
+	 * Disable LBR access if any LBR MSRs can not be accessed.
+	 */
+	if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+		x86_pmu.lbr_nr = 0;
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+		      check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+			x86_pmu.lbr_nr = 0;
+	}
+
+	/*
+	 * Access extra MSR may cause #GP under certain circumstances.
+	 * E.g. KVM doesn't support offcore event
+	 * Check all extra_regs here.
+	 */
+	if (x86_pmu.extra_regs) {
+		for (er = x86_pmu.extra_regs; er->msr; er++) {
+			er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+			/* Disable LBR select mapping */
+			if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+				x86_pmu.lbr_sel_map = NULL;
+		}
+	}
+
 	/* Support full width counters using alternative MSR range */
 	if (x86_pmu.intel_cap.full_width_write) {
 		x86_pmu.max_period = x86_pmu.cntval_mask;
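
The probing logic added above boils down to one pattern: read an MSR, flip only the bits named by a mask, write it back, and re-read; if the new value did not stick, the register is either unimplemented or emulated as a sink and must not be relied on. The sketch below is a toy model of that pattern only; msr_read(), msr_write(), probe_msr() and the fake_msr/emulated globals are illustrative stand-ins, not the kernel's rdmsrl_safe()/wrmsrl_safe() API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t fake_msr;   /* backing store for a "real" MSR            */
static bool     emulated;   /* emulator that drops writes, reads as zero */

static bool msr_read(uint64_t *val)
{
	*val = emulated ? 0 : fake_msr;
	return true;
}

static bool msr_write(uint64_t val)
{
	if (!emulated)
		fake_msr = val;
	return true;
}

static bool probe_msr(uint64_t mask)
{
	uint64_t old, tmp, readback;

	if (!msr_read(&old))
		return false;
	tmp = old ^ mask;                       /* flip only writable bits */
	if (!msr_write(tmp) || !msr_read(&readback))
		return false;
	if (readback != tmp)                    /* write silently dropped  */
		return false;
	msr_write(old);                         /* restore the old value   */
	return true;
}

int main(void)
{
	emulated = false;
	printf("real MSR accessible: %d\n", probe_msr(0x3));      /* prints 1 */
	emulated = true;
	printf("emulated MSR accessible: %d\n", probe_msr(0x3));  /* prints 0 */
	return 0;
}

Run against the LBR top-of-stack MSR with mask 0x3 and the extra/offcore registers with mask 0x1ff, as the patch does, this is what lets the PMU code degrade gracefully on hypervisors that ignore or fault on those registers.
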
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 980970cb744d..696ade311ded 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -311,9 +311,11 @@ static int alloc_bts_buffer(int cpu)
 	if (!x86_pmu.bts)
 		return 0;
 
-	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node);
-	if (unlikely(!buffer))
+	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+	if (unlikely(!buffer)) {
+		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		return -ENOMEM;
+	}
 
 	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
 	thresh = max / 16;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 65bbbea38b9c..ae6552a0701f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -550,16 +550,16 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
 	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xc),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xc),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xc),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xc),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
@@ -1222,6 +1222,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
 	SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
 				  SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+
 	SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
@@ -1245,7 +1246,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
 	SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
 	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index dbaa23e78b36..0d0c9d4ab6d5 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -425,8 +425,8 @@ sysenter_do_call:
 	cmpl $(NR_syscalls), %eax
 	jae sysenter_badsys
 	call *sys_call_table(,%eax,4)
-	movl %eax,PT_EAX(%esp)
 sysenter_after_call:
+	movl %eax,PT_EAX(%esp)
 	LOCKDEP_SYS_EXIT
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
@@ -502,6 +502,7 @@ ENTRY(system_call)
 	jae syscall_badsys
 syscall_call:
 	call *sys_call_table(,%eax,4)
+syscall_after_call:
 	movl %eax,PT_EAX(%esp)		# store the return value
 syscall_exit:
 	LOCKDEP_SYS_EXIT
@@ -675,12 +676,12 @@ syscall_fault:
 END(syscall_fault)
 
 syscall_badsys:
-	movl $-ENOSYS,PT_EAX(%esp)
-	jmp syscall_exit
+	movl $-ENOSYS,%eax
+	jmp syscall_after_call
 END(syscall_badsys)
 
 sysenter_badsys:
-	movl $-ENOSYS,PT_EAX(%esp)
+	movl $-ENOSYS,%eax
 	jmp sysenter_after_call
 END(syscall_badsys)
 	CFI_ENDPROC
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b25ca969edd2..c844f0816ab8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -830,27 +830,24 @@ restore_args:
 	RESTORE_ARGS 1,8,1
 
 irq_return:
+	INTERRUPT_RETURN
+
+ENTRY(native_iret)
 	/*
 	 * Are we returning to a stack segment from the LDT?  Note: in
 	 * 64-bit mode SS:RSP on the exception stack is always valid.
 	 */
 #ifdef CONFIG_X86_ESPFIX64
 	testb $4,(SS-RIP)(%rsp)
-	jnz irq_return_ldt
+	jnz native_irq_return_ldt
 #endif
 
-irq_return_iret:
-	INTERRUPT_RETURN
-	_ASM_EXTABLE(irq_return_iret, bad_iret)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
+native_irq_return_iret:
 	iretq
-	_ASM_EXTABLE(native_iret, bad_iret)
-#endif
+	_ASM_EXTABLE(native_irq_return_iret, bad_iret)
 
 #ifdef CONFIG_X86_ESPFIX64
-irq_return_ldt:
+native_irq_return_ldt:
 	pushq_cfi %rax
 	pushq_cfi %rdi
 	SWAPGS
@@ -872,7 +869,7 @@ irq_return_ldt:
 	SWAPGS
 	movq %rax,%rsp
 	popq_cfi %rax
-	jmp irq_return_iret
+	jmp native_irq_return_iret
 #endif
 
 	.section .fixup,"ax"
@@ -956,13 +953,8 @@ __do_double_fault:
 	cmpl $__KERNEL_CS,CS(%rdi)
 	jne do_double_fault
 	movq RIP(%rdi),%rax
-	cmpq $irq_return_iret,%rax
-#ifdef CONFIG_PARAVIRT
-	je 1f
-	cmpq $native_iret,%rax
-#endif
+	cmpq $native_irq_return_iret,%rax
 	jne do_double_fault		/* This shouldn't happen... */
-1:
 	movq PER_CPU_VAR(kernel_stack),%rax
 	subq $(6*8-KERNEL_STACK_OFFSET),%rax	/* Reset to original stack */
 	movq %rax,RSP(%rdi)
@@ -1428,7 +1420,7 @@ error_sti:
 	 */
 error_kernelspace:
 	incl %ebx
-	leaq irq_return_iret(%rip),%rcx
+	leaq native_irq_return_iret(%rip),%rcx
 	cmpq %rcx,RIP+8(%rsp)
 	je error_swapgs
 	movl %ecx,%eax	/* zero extend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 6afbb16e9b79..94d857fb1033 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -175,7 +175,7 @@ void init_espfix_ap(void)
 	if (!pud_present(pud)) {
 		pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
 		pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
-		paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+		paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
 		for (n = 0; n < ESPFIX_PUD_CLONES; n++)
 			set_pud(&pud_p[n], pud);
 	}
@@ -185,7 +185,7 @@ void init_espfix_ap(void)
 	if (!pmd_present(pmd)) {
 		pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
 		pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
-		paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+		paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
 		for (n = 0; n < ESPFIX_PMD_CLONES; n++)
 			set_pmd(&pmd_p[n], pmd);
 	}
@@ -193,7 +193,6 @@ void init_espfix_ap(void)
 	pte_p = pte_offset_kernel(&pmd, addr);
 	stack_page = (void *)__get_free_page(GFP_KERNEL);
 	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
-	paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
 	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
 		set_pte(&pte_p[n*PTE_STRIDE], pte);
 
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 7596df664901..67e6d19ef1be 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -574,6 +574,9 @@ int kprobe_int3_handler(struct pt_regs *regs)
 	struct kprobe *p;
 	struct kprobe_ctlblk *kcb;
 
+	if (user_mode_vm(regs))
+		return 0;
+
 	addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
 	/*
 	 * We don't want to be preempted for the entire
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34f93eb..a1da6737ba5b 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
 DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
 DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
 DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_irq_ops, save_fl);
 		PATCH_SITE(pv_irq_ops, irq_enable);
 		PATCH_SITE(pv_irq_ops, irq_disable);
-		PATCH_SITE(pv_cpu_ops, iret);
 		PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
 		PATCH_SITE(pv_cpu_ops, usergs_sysret32);
 		PATCH_SITE(pv_cpu_ops, usergs_sysret64);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57e5ce126d5a..ea030319b321 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -920,9 +920,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
 		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
 			mark_tsc_unstable("cpufreq changes");
-	}
 
-	set_cyc2ns_scale(tsc_khz, freq->cpu);
+		set_cyc2ns_scale(tsc_khz, freq->cpu);
+	}
 
 	return 0;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f6449334ec45..ef432f891d30 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5887,6 +5887,18 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 			kvm_x86_ops->set_nmi(vcpu);
 		}
 	} else if (kvm_cpu_has_injectable_intr(vcpu)) {
+		/*
+		 * Because interrupts can be injected asynchronously, we are
+		 * calling check_nested_events again here to avoid a race condition.
+		 * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
+		 * proposal and current concerns. Perhaps we should be setting
+		 * KVM_REQ_EVENT only on certain events and not unconditionally?
+		 */
+		if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+			r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+			if (r != 0)
+				return r;
+		}
 		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
 					    false);
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index c98583588580..ebfa9b2c871d 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -36,99 +36,133 @@
 
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 
 #include <xen/interface/xen.h>
 #include <xen/page.h>
 #include <xen/grant_table.h>
+#include <xen/xen.h>
 
 #include <asm/pgtable.h>
 
-static int map_pte_fn(pte_t *pte, struct page *pmd_page,
-		      unsigned long addr, void *data)
+static struct gnttab_vm_area {
+	struct vm_struct *area;
+	pte_t **ptes;
+} gnttab_shared_vm_area, gnttab_status_vm_area;
+
+int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+			   unsigned long max_nr_gframes,
+			   void **__shared)
 {
-	unsigned long **frames = (unsigned long **)data;
+	void *shared = *__shared;
+	unsigned long addr;
+	unsigned long i;
 
-	set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
-	(*frames)++;
-	return 0;
-}
+	if (shared == NULL)
+		*__shared = shared = gnttab_shared_vm_area.area->addr;
 
-/*
- * This function is used to map shared frames to store grant status. It is
- * different from map_pte_fn above, the frames type here is uint64_t.
- */
-static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
-			     unsigned long addr, void *data)
-{
-	uint64_t **frames = (uint64_t **)data;
+	addr = (unsigned long)shared;
+
+	for (i = 0; i < nr_gframes; i++) {
+		set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+			   mfn_pte(frames[i], PAGE_KERNEL));
+		addr += PAGE_SIZE;
+	}
 
-	set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
-	(*frames)++;
 	return 0;
 }
 
-static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
-			unsigned long addr, void *data)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+			   unsigned long max_nr_gframes,
+			   grant_status_t **__shared)
 {
+	grant_status_t *shared = *__shared;
+	unsigned long addr;
+	unsigned long i;
+
+	if (shared == NULL)
+		*__shared = shared = gnttab_status_vm_area.area->addr;
+
+	addr = (unsigned long)shared;
+
+	for (i = 0; i < nr_gframes; i++) {
+		set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
+			   mfn_pte(frames[i], PAGE_KERNEL));
+		addr += PAGE_SIZE;
+	}
 
-	set_pte_at(&init_mm, addr, pte, __pte(0));
 	return 0;
 }
 
-int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
-			   unsigned long max_nr_gframes,
-			   void **__shared)
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
 {
-	int rc;
-	void *shared = *__shared;
+	pte_t **ptes;
+	unsigned long addr;
+	unsigned long i;
 
-	if (shared == NULL) {
-		struct vm_struct *area =
-			alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
-		BUG_ON(area == NULL);
-		shared = area->addr;
-		*__shared = shared;
-	}
+	if (shared == gnttab_status_vm_area.area->addr)
+		ptes = gnttab_status_vm_area.ptes;
+	else
+		ptes = gnttab_shared_vm_area.ptes;
 
-	rc = apply_to_page_range(&init_mm, (unsigned long)shared,
-				 PAGE_SIZE * nr_gframes,
-				 map_pte_fn, &frames);
-	return rc;
+	addr = (unsigned long)shared;
+
+	for (i = 0; i < nr_gframes; i++) {
+		set_pte_at(&init_mm, addr, ptes[i], __pte(0));
+		addr += PAGE_SIZE;
+	}
 }
 
-int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
-			   unsigned long max_nr_gframes,
-			   grant_status_t **__shared)
+static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
 {
-	int rc;
-	grant_status_t *shared = *__shared;
+	area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL);
+	if (area->ptes == NULL)
+		return -ENOMEM;
 
-	if (shared == NULL) {
-		/* No need to pass in PTE as we are going to do it
-		 * in apply_to_page_range anyhow. */
-		struct vm_struct *area =
-			alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
-		BUG_ON(area == NULL);
-		shared = area->addr;
-		*__shared = shared;
+	area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes);
+	if (area->area == NULL) {
+		kfree(area->ptes);
+		return -ENOMEM;
 	}
 
-	rc = apply_to_page_range(&init_mm, (unsigned long)shared,
-				 PAGE_SIZE * nr_gframes,
-				 map_pte_fn_status, &frames);
-	return rc;
+	return 0;
 }
 
-void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
+static void arch_gnttab_vfree(struct gnttab_vm_area *area)
+{
+	free_vm_area(area->area);
+	kfree(area->ptes);
+}
+
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
 {
-	apply_to_page_range(&init_mm, (unsigned long)shared,
-			    PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
+	int ret;
+
+	if (!xen_pv_domain())
+		return 0;
+
+	ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Always allocate the space for the status frames in case
+	 * we're migrated to a host with V2 support.
+	 */
+	ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status);
+	if (ret < 0)
+		goto err;
+
+	return 0;
+ err:
+	arch_gnttab_vfree(&gnttab_shared_vm_area);
+	return -ENOMEM;
 }
+
 #ifdef CONFIG_XEN_PVH
 #include <xen/balloon.h>
 #include <xen/events.h>
-#include <xen/xen.h>
 #include <linux/slab.h>
 static int __init xlated_setup_gnttab_pages(void)
 {