aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>2012-10-19 15:19:19 -0400
committerKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>2012-10-19 15:19:19 -0400
commite05dacd71db0a5da7c1a44bcaab2a8a240b9c233 (patch)
tree31382cf1c7d62c03126448affb2fc86e8c4aaa8b /arch/x86/kernel
parent3ab0b83bf6a1e834f4b884150d8012990c75d25d (diff)
parentddffeb8c4d0331609ef2581d84de4d763607bd37 (diff)
Merge commit 'v3.7-rc1' into stable/for-linus-3.7
* commit 'v3.7-rc1': (10892 commits) Linux 3.7-rc1 x86, boot: Explicitly include autoconf.h for hostprogs perf: Fix UAPI fallout ARM: config: make sure that platforms are ordered by option string ARM: config: sort select statements alphanumerically UAPI: (Scripted) Disintegrate include/linux/byteorder UAPI: (Scripted) Disintegrate include/linux UAPI: Unexport linux/blk_types.h UAPI: Unexport part of linux/ppp-comp.h perf: Handle new rbtree implementation procfs: don't need a PATH_MAX allocation to hold a string representation of an int vfs: embed struct filename inside of names_cache allocation if possible audit: make audit_inode take struct filename vfs: make path_openat take a struct filename pointer vfs: turn do_path_lookup into wrapper around struct filename variant audit: allow audit code to satisfy getname requests from its names_list vfs: define struct filename and have getname() return it btrfs: Fix compilation with user namespace support enabled userns: Fix posix_acl_file_xattr_userns gid conversion userns: Properly print bluetooth socket uids ...
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile7
-rw-r--r--arch/x86/kernel/acpi/boot.c2
-rw-r--r--arch/x86/kernel/acpi/sleep.c15
-rw-r--r--arch/x86/kernel/alternative.c111
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c4
-rw-r--r--arch/x86/kernel/asm-offsets.c3
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/amd.c67
-rw-r--r--arch/x86/kernel/cpu/bugs.c7
-rw-r--r--arch/x86/kernel/cpu/common.c63
-rw-r--r--arch/x86/kernel/cpu/intel.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h12
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c94
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c168
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.pl5
-rw-r--r--arch/x86/kernel/cpu/perf_event.h4
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c73
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c27
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c14
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c3
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c36
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h6
-rw-r--r--arch/x86/kernel/cpu/perf_event_knc.c248
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpu/proc.c5
-rw-r--r--arch/x86/kernel/cpuid.c5
-rw-r--r--arch/x86/kernel/devicetree.c51
-rw-r--r--arch/x86/kernel/entry_32.S146
-rw-r--r--arch/x86/kernel/entry_64.S232
-rw-r--r--arch/x86/kernel/ftrace.c73
-rw-r--r--arch/x86/kernel/head_32.S31
-rw-r--r--arch/x86/kernel/i387.c292
-rw-r--r--arch/x86/kernel/i8259.c2
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kprobes.c67
-rw-r--r--arch/x86/kernel/kvm.c3
-rw-r--r--arch/x86/kernel/microcode_amd.c357
-rw-r--r--arch/x86/kernel/microcode_core.c70
-rw-r--r--arch/x86/kernel/microcode_intel.c3
-rw-r--r--arch/x86/kernel/msr.c5
-rw-r--r--arch/x86/kernel/perf_regs.c105
-rw-r--r--arch/x86/kernel/probe_roms.c2
-rw-r--r--arch/x86/kernel/process.c87
-rw-r--r--arch/x86/kernel/process_32.c37
-rw-r--r--arch/x86/kernel/process_64.c39
-rw-r--r--arch/x86/kernel/ptrace.c8
-rw-r--r--arch/x86/kernel/rtc.c2
-rw-r--r--arch/x86/kernel/setup.c5
-rw-r--r--arch/x86/kernel/signal.c233
-rw-r--r--arch/x86/kernel/smpboot.c20
-rw-r--r--arch/x86/kernel/step.c53
-rw-r--r--arch/x86/kernel/sys_i386_32.c40
-rw-r--r--arch/x86/kernel/traps.c174
-rw-r--r--arch/x86/kernel/uprobes.c52
-rw-r--r--arch/x86/kernel/vm86_32.c6
-rw-r--r--arch/x86/kernel/vsyscall_64.c49
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c6
-rw-r--r--arch/x86/kernel/xsave.c517
61 files changed, 2402 insertions, 1370 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8215e5652d97..91ce48f05f9f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,7 +23,7 @@ obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
23obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o 23obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
24obj-$(CONFIG_IRQ_WORK) += irq_work.o 24obj-$(CONFIG_IRQ_WORK) += irq_work.o
25obj-y += probe_roms.o 25obj-y += probe_roms.o
26obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 26obj-$(CONFIG_X86_32) += i386_ksyms_32.o
27obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 27obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
28obj-y += syscall_$(BITS).o 28obj-y += syscall_$(BITS).o
29obj-$(CONFIG_X86_64) += vsyscall_64.o 29obj-$(CONFIG_X86_64) += vsyscall_64.o
@@ -81,8 +81,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
81obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 81obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
82obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o 82obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
83 83
84obj-$(CONFIG_KVM_GUEST) += kvm.o 84obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
85obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
86obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 85obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
87obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o 86obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
88obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 87obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
@@ -100,6 +99,8 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
100obj-$(CONFIG_OF) += devicetree.o 99obj-$(CONFIG_OF) += devicetree.o
101obj-$(CONFIG_UPROBES) += uprobes.o 100obj-$(CONFIG_UPROBES) += uprobes.o
102 101
102obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
103
103### 104###
104# 64 bit specific files 105# 64 bit specific files
105ifeq ($(CONFIG_X86_64),y) 106ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index b2297e58c6ed..e651f7a589ac 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -656,7 +656,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
656 acpi_register_lapic(physid, ACPI_MADT_ENABLED); 656 acpi_register_lapic(physid, ACPI_MADT_ENABLED);
657 657
658 /* 658 /*
659 * If mp_register_lapic successfully generates a new logical cpu 659 * If acpi_register_lapic successfully generates a new logical cpu
660 * number, then the following will get us exactly what was mapped 660 * number, then the following will get us exactly what was mapped
661 */ 661 */
662 cpumask_andnot(new_map, cpu_present_mask, tmp_map); 662 cpumask_andnot(new_map, cpu_present_mask, tmp_map);
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 1b8e5a03d942..11676cf65aee 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -43,17 +43,22 @@ int acpi_suspend_lowlevel(void)
43 43
44 header->video_mode = saved_video_mode; 44 header->video_mode = saved_video_mode;
45 45
46 header->pmode_behavior = 0;
47
46#ifndef CONFIG_64BIT 48#ifndef CONFIG_64BIT
47 store_gdt((struct desc_ptr *)&header->pmode_gdt); 49 store_gdt((struct desc_ptr *)&header->pmode_gdt);
48 50
49 if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low, 51 if (!rdmsr_safe(MSR_EFER,
50 &header->pmode_efer_high)) 52 &header->pmode_efer_low,
51 header->pmode_efer_low = header->pmode_efer_high = 0; 53 &header->pmode_efer_high))
54 header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER);
52#endif /* !CONFIG_64BIT */ 55#endif /* !CONFIG_64BIT */
53 56
54 header->pmode_cr0 = read_cr0(); 57 header->pmode_cr0 = read_cr0();
55 header->pmode_cr4 = read_cr4_safe(); 58 if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
56 header->pmode_behavior = 0; 59 header->pmode_cr4 = read_cr4();
60 header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
61 }
57 if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, 62 if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
58 &header->pmode_misc_en_low, 63 &header->pmode_misc_en_low,
59 &header->pmode_misc_en_high)) 64 &header->pmode_misc_en_high))
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ced4534baed5..ef5ccca79a6c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -23,19 +23,6 @@
23 23
24#define MAX_PATCH_LEN (255-1) 24#define MAX_PATCH_LEN (255-1)
25 25
26#ifdef CONFIG_HOTPLUG_CPU
27static int smp_alt_once;
28
29static int __init bootonly(char *str)
30{
31 smp_alt_once = 1;
32 return 1;
33}
34__setup("smp-alt-boot", bootonly);
35#else
36#define smp_alt_once 1
37#endif
38
39static int __initdata_or_module debug_alternative; 26static int __initdata_or_module debug_alternative;
40 27
41static int __init debug_alt(char *str) 28static int __init debug_alt(char *str)
@@ -317,7 +304,7 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
317 /* turn DS segment override prefix into lock prefix */ 304 /* turn DS segment override prefix into lock prefix */
318 if (*ptr == 0x3e) 305 if (*ptr == 0x3e)
319 text_poke(ptr, ((unsigned char []){0xf0}), 1); 306 text_poke(ptr, ((unsigned char []){0xf0}), 1);
320 }; 307 }
321 mutex_unlock(&text_mutex); 308 mutex_unlock(&text_mutex);
322} 309}
323 310
@@ -326,9 +313,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
326{ 313{
327 const s32 *poff; 314 const s32 *poff;
328 315
329 if (noreplace_smp)
330 return;
331
332 mutex_lock(&text_mutex); 316 mutex_lock(&text_mutex);
333 for (poff = start; poff < end; poff++) { 317 for (poff = start; poff < end; poff++) {
334 u8 *ptr = (u8 *)poff + *poff; 318 u8 *ptr = (u8 *)poff + *poff;
@@ -338,7 +322,7 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
338 /* turn lock prefix into DS segment override prefix */ 322 /* turn lock prefix into DS segment override prefix */
339 if (*ptr == 0xf0) 323 if (*ptr == 0xf0)
340 text_poke(ptr, ((unsigned char []){0x3E}), 1); 324 text_poke(ptr, ((unsigned char []){0x3E}), 1);
341 }; 325 }
342 mutex_unlock(&text_mutex); 326 mutex_unlock(&text_mutex);
343} 327}
344 328
@@ -359,7 +343,7 @@ struct smp_alt_module {
359}; 343};
360static LIST_HEAD(smp_alt_modules); 344static LIST_HEAD(smp_alt_modules);
361static DEFINE_MUTEX(smp_alt); 345static DEFINE_MUTEX(smp_alt);
362static int smp_mode = 1; /* protected by smp_alt */ 346static bool uniproc_patched = false; /* protected by smp_alt */
363 347
364void __init_or_module alternatives_smp_module_add(struct module *mod, 348void __init_or_module alternatives_smp_module_add(struct module *mod,
365 char *name, 349 char *name,
@@ -368,19 +352,18 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
368{ 352{
369 struct smp_alt_module *smp; 353 struct smp_alt_module *smp;
370 354
371 if (noreplace_smp) 355 mutex_lock(&smp_alt);
372 return; 356 if (!uniproc_patched)
357 goto unlock;
373 358
374 if (smp_alt_once) { 359 if (num_possible_cpus() == 1)
375 if (boot_cpu_has(X86_FEATURE_UP)) 360 /* Don't bother remembering, we'll never have to undo it. */
376 alternatives_smp_unlock(locks, locks_end, 361 goto smp_unlock;
377 text, text_end);
378 return;
379 }
380 362
381 smp = kzalloc(sizeof(*smp), GFP_KERNEL); 363 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
382 if (NULL == smp) 364 if (NULL == smp)
383 return; /* we'll run the (safe but slow) SMP code then ... */ 365 /* we'll run the (safe but slow) SMP code then ... */
366 goto unlock;
384 367
385 smp->mod = mod; 368 smp->mod = mod;
386 smp->name = name; 369 smp->name = name;
@@ -392,11 +375,10 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
392 __func__, smp->locks, smp->locks_end, 375 __func__, smp->locks, smp->locks_end,
393 smp->text, smp->text_end, smp->name); 376 smp->text, smp->text_end, smp->name);
394 377
395 mutex_lock(&smp_alt);
396 list_add_tail(&smp->next, &smp_alt_modules); 378 list_add_tail(&smp->next, &smp_alt_modules);
397 if (boot_cpu_has(X86_FEATURE_UP)) 379smp_unlock:
398 alternatives_smp_unlock(smp->locks, smp->locks_end, 380 alternatives_smp_unlock(locks, locks_end, text, text_end);
399 smp->text, smp->text_end); 381unlock:
400 mutex_unlock(&smp_alt); 382 mutex_unlock(&smp_alt);
401} 383}
402 384
@@ -404,24 +386,18 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
404{ 386{
405 struct smp_alt_module *item; 387 struct smp_alt_module *item;
406 388
407 if (smp_alt_once || noreplace_smp)
408 return;
409
410 mutex_lock(&smp_alt); 389 mutex_lock(&smp_alt);
411 list_for_each_entry(item, &smp_alt_modules, next) { 390 list_for_each_entry(item, &smp_alt_modules, next) {
412 if (mod != item->mod) 391 if (mod != item->mod)
413 continue; 392 continue;
414 list_del(&item->next); 393 list_del(&item->next);
415 mutex_unlock(&smp_alt);
416 DPRINTK("%s: %s\n", __func__, item->name);
417 kfree(item); 394 kfree(item);
418 return; 395 break;
419 } 396 }
420 mutex_unlock(&smp_alt); 397 mutex_unlock(&smp_alt);
421} 398}
422 399
423bool skip_smp_alternatives; 400void alternatives_enable_smp(void)
424void alternatives_smp_switch(int smp)
425{ 401{
426 struct smp_alt_module *mod; 402 struct smp_alt_module *mod;
427 403
@@ -436,34 +412,21 @@ void alternatives_smp_switch(int smp)
436 pr_info("lockdep: fixing up alternatives\n"); 412 pr_info("lockdep: fixing up alternatives\n");
437#endif 413#endif
438 414
439 if (noreplace_smp || smp_alt_once || skip_smp_alternatives) 415 /* Why bother if there are no other CPUs? */
440 return; 416 BUG_ON(num_possible_cpus() == 1);
441 BUG_ON(!smp && (num_online_cpus() > 1));
442 417
443 mutex_lock(&smp_alt); 418 mutex_lock(&smp_alt);
444 419
445 /* 420 if (uniproc_patched) {
446 * Avoid unnecessary switches because it forces JIT based VMs to
447 * throw away all cached translations, which can be quite costly.
448 */
449 if (smp == smp_mode) {
450 /* nothing */
451 } else if (smp) {
452 pr_info("switching to SMP code\n"); 421 pr_info("switching to SMP code\n");
422 BUG_ON(num_online_cpus() != 1);
453 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 423 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
454 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 424 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
455 list_for_each_entry(mod, &smp_alt_modules, next) 425 list_for_each_entry(mod, &smp_alt_modules, next)
456 alternatives_smp_lock(mod->locks, mod->locks_end, 426 alternatives_smp_lock(mod->locks, mod->locks_end,
457 mod->text, mod->text_end); 427 mod->text, mod->text_end);
458 } else { 428 uniproc_patched = false;
459 pr_info("switching to UP code\n");
460 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
461 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
462 list_for_each_entry(mod, &smp_alt_modules, next)
463 alternatives_smp_unlock(mod->locks, mod->locks_end,
464 mod->text, mod->text_end);
465 } 429 }
466 smp_mode = smp;
467 mutex_unlock(&smp_alt); 430 mutex_unlock(&smp_alt);
468} 431}
469 432
@@ -540,40 +503,22 @@ void __init alternative_instructions(void)
540 503
541 apply_alternatives(__alt_instructions, __alt_instructions_end); 504 apply_alternatives(__alt_instructions, __alt_instructions_end);
542 505
543 /* switch to patch-once-at-boottime-only mode and free the
544 * tables in case we know the number of CPUs will never ever
545 * change */
546#ifdef CONFIG_HOTPLUG_CPU
547 if (num_possible_cpus() < 2)
548 smp_alt_once = 1;
549#endif
550
551#ifdef CONFIG_SMP 506#ifdef CONFIG_SMP
552 if (smp_alt_once) { 507 /* Patch to UP if other cpus not imminent. */
553 if (1 == num_possible_cpus()) { 508 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
554 pr_info("switching to UP code\n"); 509 uniproc_patched = true;
555 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
556 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
557
558 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
559 _text, _etext);
560 }
561 } else {
562 alternatives_smp_module_add(NULL, "core kernel", 510 alternatives_smp_module_add(NULL, "core kernel",
563 __smp_locks, __smp_locks_end, 511 __smp_locks, __smp_locks_end,
564 _text, _etext); 512 _text, _etext);
565
566 /* Only switch to UP mode if we don't immediately boot others */
567 if (num_present_cpus() == 1 || setup_max_cpus <= 1)
568 alternatives_smp_switch(0);
569 } 513 }
570#endif
571 apply_paravirt(__parainstructions, __parainstructions_end);
572 514
573 if (smp_alt_once) 515 if (!uniproc_patched || num_possible_cpus() == 1)
574 free_init_pages("SMP alternatives", 516 free_init_pages("SMP alternatives",
575 (unsigned long)__smp_locks, 517 (unsigned long)__smp_locks,
576 (unsigned long)__smp_locks_end); 518 (unsigned long)__smp_locks_end);
519#endif
520
521 apply_paravirt(__parainstructions, __parainstructions_end);
577 522
578 restart_nmi(); 523 restart_nmi();
579} 524}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 24deb3082328..b17416e72fbd 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1934,7 +1934,7 @@ void smp_error_interrupt(struct pt_regs *regs)
1934 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); 1934 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
1935 i++; 1935 i++;
1936 v1 >>= 1; 1936 v1 >>= 1;
1937 }; 1937 }
1938 1938
1939 apic_printk(APIC_DEBUG, KERN_CONT "\n"); 1939 apic_printk(APIC_DEBUG, KERN_CONT "\n");
1940 1940
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index bc552cff2578..a65829ac2b9a 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -30,7 +30,7 @@
30 30
31static int numachip_system __read_mostly; 31static int numachip_system __read_mostly;
32 32
33static struct apic apic_numachip __read_mostly; 33static const struct apic apic_numachip __read_mostly;
34 34
35static unsigned int get_apic_id(unsigned long x) 35static unsigned int get_apic_id(unsigned long x)
36{ 36{
@@ -199,7 +199,7 @@ static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
199 return 0; 199 return 0;
200} 200}
201 201
202static struct apic apic_numachip __refconst = { 202static const struct apic apic_numachip __refconst = {
203 203
204 .name = "NumaConnect system", 204 .name = "NumaConnect system",
205 .probe = numachip_probe, 205 .probe = numachip_probe,
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 68de2dc962ec..28610822fb3c 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -69,4 +69,7 @@ void common(void) {
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); 69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70 OFFSET(BP_pref_address, boot_params, hdr.pref_address); 70 OFFSET(BP_pref_address, boot_params, hdr.pref_address);
71 OFFSET(BP_code32_start, boot_params, hdr.code32_start); 71 OFFSET(BP_code32_start, boot_params, hdr.code32_start);
72
73 BLANK();
74 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
72} 75}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d30a6a9a0121..a0e067d3d96c 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -32,7 +32,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
32 32
33ifdef CONFIG_PERF_EVENTS 33ifdef CONFIG_PERF_EVENTS
34obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o 34obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o
35obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o 35obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
36obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o 36obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
37obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o 37obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
38endif 38endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9d92e19039f0..f7e98a2c0d12 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -737,6 +737,72 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
737} 737}
738#endif 738#endif
739 739
740static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
741{
742 if (!cpu_has_invlpg)
743 return;
744
745 tlb_flushall_shift = 5;
746
747 if (c->x86 <= 0x11)
748 tlb_flushall_shift = 4;
749}
750
751static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
752{
753 u32 ebx, eax, ecx, edx;
754 u16 mask = 0xfff;
755
756 if (c->x86 < 0xf)
757 return;
758
759 if (c->extended_cpuid_level < 0x80000006)
760 return;
761
762 cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
763
764 tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
765 tlb_lli_4k[ENTRIES] = ebx & mask;
766
767 /*
768 * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
769 * characteristics from the CPUID function 0x80000005 instead.
770 */
771 if (c->x86 == 0xf) {
772 cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
773 mask = 0xff;
774 }
775
776 /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
777 if (!((eax >> 16) & mask)) {
778 u32 a, b, c, d;
779
780 cpuid(0x80000005, &a, &b, &c, &d);
781 tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff;
782 } else {
783 tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
784 }
785
786 /* a 4M entry uses two 2M entries */
787 tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
788
789 /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
790 if (!(eax & mask)) {
791 /* Erratum 658 */
792 if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
793 tlb_lli_2m[ENTRIES] = 1024;
794 } else {
795 cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
796 tlb_lli_2m[ENTRIES] = eax & 0xff;
797 }
798 } else
799 tlb_lli_2m[ENTRIES] = eax & mask;
800
801 tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
802
803 cpu_set_tlb_flushall_shift(c);
804}
805
740static const struct cpu_dev __cpuinitconst amd_cpu_dev = { 806static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
741 .c_vendor = "AMD", 807 .c_vendor = "AMD",
742 .c_ident = { "AuthenticAMD" }, 808 .c_ident = { "AuthenticAMD" },
@@ -756,6 +822,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
756 .c_size_cache = amd_size_cache, 822 .c_size_cache = amd_size_cache,
757#endif 823#endif
758 .c_early_init = early_init_amd, 824 .c_early_init = early_init_amd,
825 .c_detect_tlb = cpu_detect_tlb_amd,
759 .c_bsp_init = bsp_init_amd, 826 .c_bsp_init = bsp_init_amd,
760 .c_init = init_amd, 827 .c_init = init_amd,
761 .c_x86_vendor = X86_VENDOR_AMD, 828 .c_x86_vendor = X86_VENDOR_AMD,
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c97bb7b5a9f8..d0e910da16c5 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -165,10 +165,15 @@ void __init check_bugs(void)
165 print_cpu_info(&boot_cpu_data); 165 print_cpu_info(&boot_cpu_data);
166#endif 166#endif
167 check_config(); 167 check_config();
168 check_fpu();
169 check_hlt(); 168 check_hlt();
170 check_popad(); 169 check_popad();
171 init_utsname()->machine[1] = 170 init_utsname()->machine[1] =
172 '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); 171 '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
173 alternative_instructions(); 172 alternative_instructions();
173
174 /*
175 * kernel_fpu_begin/end() in check_fpu() relies on the patched
176 * alternative instructions.
177 */
178 check_fpu();
174} 179}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a5fbc3c5fccc..7505f7b13e71 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -259,23 +259,36 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
259} 259}
260#endif 260#endif
261 261
262static int disable_smep __cpuinitdata;
263static __init int setup_disable_smep(char *arg) 262static __init int setup_disable_smep(char *arg)
264{ 263{
265 disable_smep = 1; 264 setup_clear_cpu_cap(X86_FEATURE_SMEP);
266 return 1; 265 return 1;
267} 266}
268__setup("nosmep", setup_disable_smep); 267__setup("nosmep", setup_disable_smep);
269 268
270static __cpuinit void setup_smep(struct cpuinfo_x86 *c) 269static __always_inline void setup_smep(struct cpuinfo_x86 *c)
271{ 270{
272 if (cpu_has(c, X86_FEATURE_SMEP)) { 271 if (cpu_has(c, X86_FEATURE_SMEP))
273 if (unlikely(disable_smep)) { 272 set_in_cr4(X86_CR4_SMEP);
274 setup_clear_cpu_cap(X86_FEATURE_SMEP); 273}
275 clear_in_cr4(X86_CR4_SMEP); 274
276 } else 275static __init int setup_disable_smap(char *arg)
277 set_in_cr4(X86_CR4_SMEP); 276{
278 } 277 setup_clear_cpu_cap(X86_FEATURE_SMAP);
278 return 1;
279}
280__setup("nosmap", setup_disable_smap);
281
282static __always_inline void setup_smap(struct cpuinfo_x86 *c)
283{
284 unsigned long eflags;
285
286 /* This should have been cleared long ago */
287 raw_local_save_flags(eflags);
288 BUG_ON(eflags & X86_EFLAGS_AC);
289
290 if (cpu_has(c, X86_FEATURE_SMAP))
291 set_in_cr4(X86_CR4_SMAP);
279} 292}
280 293
281/* 294/*
@@ -476,7 +489,7 @@ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
476 489
477 printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ 490 printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
478 "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ 491 "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
479 "tlb_flushall_shift is 0x%x\n", 492 "tlb_flushall_shift: %d\n",
480 tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], 493 tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
481 tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], 494 tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
482 tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], 495 tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
@@ -712,8 +725,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
712 c->cpu_index = 0; 725 c->cpu_index = 0;
713 filter_cpuid_features(c, false); 726 filter_cpuid_features(c, false);
714 727
715 setup_smep(c);
716
717 if (this_cpu->c_bsp_init) 728 if (this_cpu->c_bsp_init)
718 this_cpu->c_bsp_init(c); 729 this_cpu->c_bsp_init(c);
719} 730}
@@ -798,8 +809,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
798 c->phys_proc_id = c->initial_apicid; 809 c->phys_proc_id = c->initial_apicid;
799 } 810 }
800 811
801 setup_smep(c);
802
803 get_model_name(c); /* Default name */ 812 get_model_name(c); /* Default name */
804 813
805 detect_nopl(c); 814 detect_nopl(c);
@@ -864,6 +873,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
864 /* Disable the PN if appropriate */ 873 /* Disable the PN if appropriate */
865 squash_the_stupid_serial_number(c); 874 squash_the_stupid_serial_number(c);
866 875
876 /* Set up SMEP/SMAP */
877 setup_smep(c);
878 setup_smap(c);
879
867 /* 880 /*
868 * The vendor-specific functions might have changed features. 881 * The vendor-specific functions might have changed features.
869 * Now we do "generic changes." 882 * Now we do "generic changes."
@@ -942,8 +955,7 @@ void __init identify_boot_cpu(void)
942#else 955#else
943 vgetcpu_set_mode(); 956 vgetcpu_set_mode();
944#endif 957#endif
945 if (boot_cpu_data.cpuid_level >= 2) 958 cpu_detect_tlb(&boot_cpu_data);
946 cpu_detect_tlb(&boot_cpu_data);
947} 959}
948 960
949void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 961void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1023,14 +1035,16 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1023 printk(KERN_CONT "%s ", vendor); 1035 printk(KERN_CONT "%s ", vendor);
1024 1036
1025 if (c->x86_model_id[0]) 1037 if (c->x86_model_id[0])
1026 printk(KERN_CONT "%s", c->x86_model_id); 1038 printk(KERN_CONT "%s", strim(c->x86_model_id));
1027 else 1039 else
1028 printk(KERN_CONT "%d86", c->x86); 1040 printk(KERN_CONT "%d86", c->x86);
1029 1041
1042 printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model);
1043
1030 if (c->x86_mask || c->cpuid_level >= 0) 1044 if (c->x86_mask || c->cpuid_level >= 0)
1031 printk(KERN_CONT " stepping %02x\n", c->x86_mask); 1045 printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask);
1032 else 1046 else
1033 printk(KERN_CONT "\n"); 1047 printk(KERN_CONT ")\n");
1034 1048
1035 print_cpu_msr(c); 1049 print_cpu_msr(c);
1036} 1050}
@@ -1113,11 +1127,10 @@ void syscall_init(void)
1113 1127
1114 /* Flags to clear on syscall */ 1128 /* Flags to clear on syscall */
1115 wrmsrl(MSR_SYSCALL_MASK, 1129 wrmsrl(MSR_SYSCALL_MASK,
1116 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); 1130 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
1131 X86_EFLAGS_IOPL|X86_EFLAGS_AC);
1117} 1132}
1118 1133
1119unsigned long kernel_eflags;
1120
1121/* 1134/*
1122 * Copies of the original ist values from the tss are only accessed during 1135 * Copies of the original ist values from the tss are only accessed during
1123 * debugging, no special alignment required. 1136 * debugging, no special alignment required.
@@ -1297,9 +1310,6 @@ void __cpuinit cpu_init(void)
1297 dbg_restore_debug_regs(); 1310 dbg_restore_debug_regs();
1298 1311
1299 fpu_init(); 1312 fpu_init();
1300 xsave_init();
1301
1302 raw_local_save_flags(kernel_eflags);
1303 1313
1304 if (is_uv_system()) 1314 if (is_uv_system())
1305 uv_cpu_init(); 1315 uv_cpu_init();
@@ -1352,6 +1362,5 @@ void __cpuinit cpu_init(void)
1352 dbg_restore_debug_regs(); 1362 dbg_restore_debug_regs();
1353 1363
1354 fpu_init(); 1364 fpu_init();
1355 xsave_init();
1356} 1365}
1357#endif 1366#endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0a4ce2980a5a..198e019a531a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -648,6 +648,10 @@ static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c)
648 int i, j, n; 648 int i, j, n;
649 unsigned int regs[4]; 649 unsigned int regs[4];
650 unsigned char *desc = (unsigned char *)regs; 650 unsigned char *desc = (unsigned char *)regs;
651
652 if (c->cpuid_level < 2)
653 return;
654
651 /* Number of times to iterate */ 655 /* Number of times to iterate */
652 n = cpuid_eax(2) & 0xFF; 656 n = cpuid_eax(2) & 0xFF;
653 657
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index fc4beb393577..ddc72f839332 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
78} 78}
79 79
80static cpumask_var_t mce_inject_cpumask; 80static cpumask_var_t mce_inject_cpumask;
81static DEFINE_MUTEX(mce_inject_mutex);
81 82
82static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) 83static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
83{ 84{
@@ -194,7 +195,11 @@ static void raise_mce(struct mce *m)
194 put_online_cpus(); 195 put_online_cpus();
195 } else 196 } else
196#endif 197#endif
198 {
199 preempt_disable();
197 raise_local(); 200 raise_local();
201 preempt_enable();
202 }
198} 203}
199 204
200/* Error injection interface */ 205/* Error injection interface */
@@ -225,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
225 * so do it a jiffie or two later everywhere. 230 * so do it a jiffie or two later everywhere.
226 */ 231 */
227 schedule_timeout(2); 232 schedule_timeout(2);
233
234 mutex_lock(&mce_inject_mutex);
228 raise_mce(&m); 235 raise_mce(&m);
236 mutex_unlock(&mce_inject_mutex);
229 return usize; 237 return usize;
230} 238}
231 239
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index ed44c8a65858..6a05c1d327a9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,6 +28,18 @@ extern int mce_ser;
28 28
29extern struct mce_bank *mce_banks; 29extern struct mce_bank *mce_banks;
30 30
31#ifdef CONFIG_X86_MCE_INTEL
32unsigned long mce_intel_adjust_timer(unsigned long interval);
33void mce_intel_cmci_poll(void);
34void mce_intel_hcpu_update(unsigned long cpu);
35#else
36# define mce_intel_adjust_timer mce_adjust_timer_default
37static inline void mce_intel_cmci_poll(void) { }
38static inline void mce_intel_hcpu_update(unsigned long cpu) { }
39#endif
40
41void mce_timer_kick(unsigned long interval);
42
31#ifdef CONFIG_ACPI_APEI 43#ifdef CONFIG_ACPI_APEI
32int apei_write_mce(struct mce *m); 44int apei_write_mce(struct mce *m);
33ssize_t apei_read_mce(struct mce *m, u64 *record_id); 45ssize_t apei_read_mce(struct mce *m, u64 *record_id);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 292d0258311c..29e87d3b2843 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -83,6 +83,7 @@ static int mce_dont_log_ce __read_mostly;
83int mce_cmci_disabled __read_mostly; 83int mce_cmci_disabled __read_mostly;
84int mce_ignore_ce __read_mostly; 84int mce_ignore_ce __read_mostly;
85int mce_ser __read_mostly; 85int mce_ser __read_mostly;
86int mce_bios_cmci_threshold __read_mostly;
86 87
87struct mce_bank *mce_banks __read_mostly; 88struct mce_bank *mce_banks __read_mostly;
88 89
@@ -1266,6 +1267,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */
1266static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ 1267static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1267static DEFINE_PER_CPU(struct timer_list, mce_timer); 1268static DEFINE_PER_CPU(struct timer_list, mce_timer);
1268 1269
1270static unsigned long mce_adjust_timer_default(unsigned long interval)
1271{
1272 return interval;
1273}
1274
1275static unsigned long (*mce_adjust_timer)(unsigned long interval) =
1276 mce_adjust_timer_default;
1277
1269static void mce_timer_fn(unsigned long data) 1278static void mce_timer_fn(unsigned long data)
1270{ 1279{
1271 struct timer_list *t = &__get_cpu_var(mce_timer); 1280 struct timer_list *t = &__get_cpu_var(mce_timer);
@@ -1276,6 +1285,7 @@ static void mce_timer_fn(unsigned long data)
1276 if (mce_available(__this_cpu_ptr(&cpu_info))) { 1285 if (mce_available(__this_cpu_ptr(&cpu_info))) {
1277 machine_check_poll(MCP_TIMESTAMP, 1286 machine_check_poll(MCP_TIMESTAMP,
1278 &__get_cpu_var(mce_poll_banks)); 1287 &__get_cpu_var(mce_poll_banks));
1288 mce_intel_cmci_poll();
1279 } 1289 }
1280 1290
1281 /* 1291 /*
@@ -1283,14 +1293,38 @@ static void mce_timer_fn(unsigned long data)
1283 * polling interval, otherwise increase the polling interval. 1293 * polling interval, otherwise increase the polling interval.
1284 */ 1294 */
1285 iv = __this_cpu_read(mce_next_interval); 1295 iv = __this_cpu_read(mce_next_interval);
1286 if (mce_notify_irq()) 1296 if (mce_notify_irq()) {
1287 iv = max(iv / 2, (unsigned long) HZ/100); 1297 iv = max(iv / 2, (unsigned long) HZ/100);
1288 else 1298 } else {
1289 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); 1299 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1300 iv = mce_adjust_timer(iv);
1301 }
1290 __this_cpu_write(mce_next_interval, iv); 1302 __this_cpu_write(mce_next_interval, iv);
1303 /* Might have become 0 after CMCI storm subsided */
1304 if (iv) {
1305 t->expires = jiffies + iv;
1306 add_timer_on(t, smp_processor_id());
1307 }
1308}
1291 1309
1292 t->expires = jiffies + iv; 1310/*
1293 add_timer_on(t, smp_processor_id()); 1311 * Ensure that the timer is firing in @interval from now.
1312 */
1313void mce_timer_kick(unsigned long interval)
1314{
1315 struct timer_list *t = &__get_cpu_var(mce_timer);
1316 unsigned long when = jiffies + interval;
1317 unsigned long iv = __this_cpu_read(mce_next_interval);
1318
1319 if (timer_pending(t)) {
1320 if (time_before(when, t->expires))
1321 mod_timer_pinned(t, when);
1322 } else {
1323 t->expires = round_jiffies(when);
1324 add_timer_on(t, smp_processor_id());
1325 }
1326 if (interval < iv)
1327 __this_cpu_write(mce_next_interval, interval);
1294} 1328}
1295 1329
1296/* Must not be called in IRQ context where del_timer_sync() can deadlock */ 1330/* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -1585,6 +1619,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1585 switch (c->x86_vendor) { 1619 switch (c->x86_vendor) {
1586 case X86_VENDOR_INTEL: 1620 case X86_VENDOR_INTEL:
1587 mce_intel_feature_init(c); 1621 mce_intel_feature_init(c);
1622 mce_adjust_timer = mce_intel_adjust_timer;
1588 break; 1623 break;
1589 case X86_VENDOR_AMD: 1624 case X86_VENDOR_AMD:
1590 mce_amd_feature_init(c); 1625 mce_amd_feature_init(c);
@@ -1594,23 +1629,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1594 } 1629 }
1595} 1630}
1596 1631
1597static void __mcheck_cpu_init_timer(void) 1632static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1598{ 1633{
1599 struct timer_list *t = &__get_cpu_var(mce_timer); 1634 unsigned long iv = mce_adjust_timer(check_interval * HZ);
1600 unsigned long iv = check_interval * HZ;
1601 1635
1602 setup_timer(t, mce_timer_fn, smp_processor_id()); 1636 __this_cpu_write(mce_next_interval, iv);
1603 1637
1604 if (mce_ignore_ce) 1638 if (mce_ignore_ce || !iv)
1605 return; 1639 return;
1606 1640
1607 __this_cpu_write(mce_next_interval, iv);
1608 if (!iv)
1609 return;
1610 t->expires = round_jiffies(jiffies + iv); 1641 t->expires = round_jiffies(jiffies + iv);
1611 add_timer_on(t, smp_processor_id()); 1642 add_timer_on(t, smp_processor_id());
1612} 1643}
1613 1644
1645static void __mcheck_cpu_init_timer(void)
1646{
1647 struct timer_list *t = &__get_cpu_var(mce_timer);
1648 unsigned int cpu = smp_processor_id();
1649
1650 setup_timer(t, mce_timer_fn, cpu);
1651 mce_start_timer(cpu, t);
1652}
1653
1614/* Handle unconfigured int18 (should never happen) */ 1654/* Handle unconfigured int18 (should never happen) */
1615static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1655static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1616{ 1656{
@@ -1907,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = {
1907 * check, or 0 to not wait 1947 * check, or 0 to not wait
1908 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1948 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1909 * mce=nobootlog Don't log MCEs from before booting. 1949 * mce=nobootlog Don't log MCEs from before booting.
1950 * mce=bios_cmci_threshold Don't program the CMCI threshold
1910 */ 1951 */
1911static int __init mcheck_enable(char *str) 1952static int __init mcheck_enable(char *str)
1912{ 1953{
@@ -1926,6 +1967,8 @@ static int __init mcheck_enable(char *str)
1926 mce_ignore_ce = 1; 1967 mce_ignore_ce = 1;
1927 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1968 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1928 mce_bootlog = (str[0] == 'b'); 1969 mce_bootlog = (str[0] == 'b');
1970 else if (!strcmp(str, "bios_cmci_threshold"))
1971 mce_bios_cmci_threshold = 1;
1929 else if (isdigit(str[0])) { 1972 else if (isdigit(str[0])) {
1930 get_option(&str, &tolerant); 1973 get_option(&str, &tolerant);
1931 if (*str == ',') { 1974 if (*str == ',') {
@@ -2166,6 +2209,11 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
2166 &mce_cmci_disabled 2209 &mce_cmci_disabled
2167}; 2210};
2168 2211
2212static struct dev_ext_attribute dev_attr_bios_cmci_threshold = {
2213 __ATTR(bios_cmci_threshold, 0444, device_show_int, NULL),
2214 &mce_bios_cmci_threshold
2215};
2216
2169static struct device_attribute *mce_device_attrs[] = { 2217static struct device_attribute *mce_device_attrs[] = {
2170 &dev_attr_tolerant.attr, 2218 &dev_attr_tolerant.attr,
2171 &dev_attr_check_interval.attr, 2219 &dev_attr_check_interval.attr,
@@ -2174,6 +2222,7 @@ static struct device_attribute *mce_device_attrs[] = {
2174 &dev_attr_dont_log_ce.attr, 2222 &dev_attr_dont_log_ce.attr,
2175 &dev_attr_ignore_ce.attr, 2223 &dev_attr_ignore_ce.attr,
2176 &dev_attr_cmci_disabled.attr, 2224 &dev_attr_cmci_disabled.attr,
2225 &dev_attr_bios_cmci_threshold.attr,
2177 NULL 2226 NULL
2178}; 2227};
2179 2228
@@ -2294,38 +2343,33 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2294 unsigned int cpu = (unsigned long)hcpu; 2343 unsigned int cpu = (unsigned long)hcpu;
2295 struct timer_list *t = &per_cpu(mce_timer, cpu); 2344 struct timer_list *t = &per_cpu(mce_timer, cpu);
2296 2345
2297 switch (action) { 2346 switch (action & ~CPU_TASKS_FROZEN) {
2298 case CPU_ONLINE: 2347 case CPU_ONLINE:
2299 case CPU_ONLINE_FROZEN:
2300 mce_device_create(cpu); 2348 mce_device_create(cpu);
2301 if (threshold_cpu_callback) 2349 if (threshold_cpu_callback)
2302 threshold_cpu_callback(action, cpu); 2350 threshold_cpu_callback(action, cpu);
2303 break; 2351 break;
2304 case CPU_DEAD: 2352 case CPU_DEAD:
2305 case CPU_DEAD_FROZEN:
2306 if (threshold_cpu_callback) 2353 if (threshold_cpu_callback)
2307 threshold_cpu_callback(action, cpu); 2354 threshold_cpu_callback(action, cpu);
2308 mce_device_remove(cpu); 2355 mce_device_remove(cpu);
2356 mce_intel_hcpu_update(cpu);
2309 break; 2357 break;
2310 case CPU_DOWN_PREPARE: 2358 case CPU_DOWN_PREPARE:
2311 case CPU_DOWN_PREPARE_FROZEN:
2312 del_timer_sync(t);
2313 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 2359 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2360 del_timer_sync(t);
2314 break; 2361 break;
2315 case CPU_DOWN_FAILED: 2362 case CPU_DOWN_FAILED:
2316 case CPU_DOWN_FAILED_FROZEN:
2317 if (!mce_ignore_ce && check_interval) {
2318 t->expires = round_jiffies(jiffies +
2319 per_cpu(mce_next_interval, cpu));
2320 add_timer_on(t, cpu);
2321 }
2322 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2363 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2364 mce_start_timer(cpu, t);
2323 break; 2365 break;
2324 case CPU_POST_DEAD: 2366 }
2367
2368 if (action == CPU_POST_DEAD) {
2325 /* intentionally ignoring frozen here */ 2369 /* intentionally ignoring frozen here */
2326 cmci_rediscover(cpu); 2370 cmci_rediscover(cpu);
2327 break;
2328 } 2371 }
2372
2329 return NOTIFY_OK; 2373 return NOTIFY_OK;
2330} 2374}
2331 2375
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 38e49bc95ffc..5f88abf07e9c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -15,6 +15,8 @@
15#include <asm/msr.h> 15#include <asm/msr.h>
16#include <asm/mce.h> 16#include <asm/mce.h>
17 17
18#include "mce-internal.h"
19
18/* 20/*
19 * Support for Intel Correct Machine Check Interrupts. This allows 21 * Support for Intel Correct Machine Check Interrupts. This allows
20 * the CPU to raise an interrupt when a corrected machine check happened. 22 * the CPU to raise an interrupt when a corrected machine check happened.
@@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
30 */ 32 */
31static DEFINE_RAW_SPINLOCK(cmci_discover_lock); 33static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
32 34
33#define CMCI_THRESHOLD 1 35#define CMCI_THRESHOLD 1
36#define CMCI_POLL_INTERVAL (30 * HZ)
37#define CMCI_STORM_INTERVAL (1 * HZ)
38#define CMCI_STORM_THRESHOLD 15
39
40static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
41static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
42static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
43
44enum {
45 CMCI_STORM_NONE,
46 CMCI_STORM_ACTIVE,
47 CMCI_STORM_SUBSIDED,
48};
49
50static atomic_t cmci_storm_on_cpus;
34 51
35static int cmci_supported(int *banks) 52static int cmci_supported(int *banks)
36{ 53{
@@ -53,6 +70,93 @@ static int cmci_supported(int *banks)
53 return !!(cap & MCG_CMCI_P); 70 return !!(cap & MCG_CMCI_P);
54} 71}
55 72
73void mce_intel_cmci_poll(void)
74{
75 if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
76 return;
77 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
78}
79
80void mce_intel_hcpu_update(unsigned long cpu)
81{
82 if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
83 atomic_dec(&cmci_storm_on_cpus);
84
85 per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
86}
87
88unsigned long mce_intel_adjust_timer(unsigned long interval)
89{
90 int r;
91
92 if (interval < CMCI_POLL_INTERVAL)
93 return interval;
94
95 switch (__this_cpu_read(cmci_storm_state)) {
96 case CMCI_STORM_ACTIVE:
97 /*
98 * We switch back to interrupt mode once the poll timer has
99 * silenced itself. That means no events recorded and the
100 * timer interval is back to our poll interval.
101 */
102 __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
103 r = atomic_sub_return(1, &cmci_storm_on_cpus);
104 if (r == 0)
105 pr_notice("CMCI storm subsided: switching to interrupt mode\n");
106 /* FALLTHROUGH */
107
108 case CMCI_STORM_SUBSIDED:
109 /*
110 * We wait for all cpus to go back to SUBSIDED
111 * state. When that happens we switch back to
112 * interrupt mode.
113 */
114 if (!atomic_read(&cmci_storm_on_cpus)) {
115 __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
116 cmci_reenable();
117 cmci_recheck();
118 }
119 return CMCI_POLL_INTERVAL;
120 default:
121 /*
122 * We have shiny weather. Let the poll do whatever it
123 * thinks.
124 */
125 return interval;
126 }
127}
128
129static bool cmci_storm_detect(void)
130{
131 unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
132 unsigned long ts = __this_cpu_read(cmci_time_stamp);
133 unsigned long now = jiffies;
134 int r;
135
136 if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
137 return true;
138
139 if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
140 cnt++;
141 } else {
142 cnt = 1;
143 __this_cpu_write(cmci_time_stamp, now);
144 }
145 __this_cpu_write(cmci_storm_cnt, cnt);
146
147 if (cnt <= CMCI_STORM_THRESHOLD)
148 return false;
149
150 cmci_clear();
151 __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
152 r = atomic_add_return(1, &cmci_storm_on_cpus);
153 mce_timer_kick(CMCI_POLL_INTERVAL);
154
155 if (r == 1)
156 pr_notice("CMCI storm detected: switching to poll mode\n");
157 return true;
158}
159
56/* 160/*
57 * The interrupt handler. This is called on every event. 161 * The interrupt handler. This is called on every event.
58 * Just call the poller directly to log any events. 162 * Just call the poller directly to log any events.
@@ -61,33 +165,28 @@ static int cmci_supported(int *banks)
61 */ 165 */
62static void intel_threshold_interrupt(void) 166static void intel_threshold_interrupt(void)
63{ 167{
168 if (cmci_storm_detect())
169 return;
64 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 170 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
65 mce_notify_irq(); 171 mce_notify_irq();
66} 172}
67 173
68static void print_update(char *type, int *hdr, int num)
69{
70 if (*hdr == 0)
71 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
72 *hdr = 1;
73 printk(KERN_CONT " %s:%d", type, num);
74}
75
76/* 174/*
77 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks 175 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
78 * on this CPU. Use the algorithm recommended in the SDM to discover shared 176 * on this CPU. Use the algorithm recommended in the SDM to discover shared
79 * banks. 177 * banks.
80 */ 178 */
81static void cmci_discover(int banks, int boot) 179static void cmci_discover(int banks)
82{ 180{
83 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); 181 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
84 unsigned long flags; 182 unsigned long flags;
85 int hdr = 0;
86 int i; 183 int i;
184 int bios_wrong_thresh = 0;
87 185
88 raw_spin_lock_irqsave(&cmci_discover_lock, flags); 186 raw_spin_lock_irqsave(&cmci_discover_lock, flags);
89 for (i = 0; i < banks; i++) { 187 for (i = 0; i < banks; i++) {
90 u64 val; 188 u64 val;
189 int bios_zero_thresh = 0;
91 190
92 if (test_bit(i, owned)) 191 if (test_bit(i, owned))
93 continue; 192 continue;
@@ -96,29 +195,52 @@ static void cmci_discover(int banks, int boot)
96 195
97 /* Already owned by someone else? */ 196 /* Already owned by someone else? */
98 if (val & MCI_CTL2_CMCI_EN) { 197 if (val & MCI_CTL2_CMCI_EN) {
99 if (test_and_clear_bit(i, owned) && !boot) 198 clear_bit(i, owned);
100 print_update("SHD", &hdr, i);
101 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 199 __clear_bit(i, __get_cpu_var(mce_poll_banks));
102 continue; 200 continue;
103 } 201 }
104 202
105 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; 203 if (!mce_bios_cmci_threshold) {
106 val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; 204 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
205 val |= CMCI_THRESHOLD;
206 } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
207 /*
208 * If bios_cmci_threshold boot option was specified
209 * but the threshold is zero, we'll try to initialize
210 * it to 1.
211 */
212 bios_zero_thresh = 1;
213 val |= CMCI_THRESHOLD;
214 }
215
216 val |= MCI_CTL2_CMCI_EN;
107 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 217 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
108 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 218 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
109 219
110 /* Did the enable bit stick? -- the bank supports CMCI */ 220 /* Did the enable bit stick? -- the bank supports CMCI */
111 if (val & MCI_CTL2_CMCI_EN) { 221 if (val & MCI_CTL2_CMCI_EN) {
112 if (!test_and_set_bit(i, owned) && !boot) 222 set_bit(i, owned);
113 print_update("CMCI", &hdr, i);
114 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 223 __clear_bit(i, __get_cpu_var(mce_poll_banks));
224 /*
225 * We are able to set thresholds for some banks that
226 * had a threshold of 0. This means the BIOS has not
227 * set the thresholds properly or does not work with
228 * this boot option. Note down now and report later.
229 */
230 if (mce_bios_cmci_threshold && bios_zero_thresh &&
231 (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
232 bios_wrong_thresh = 1;
115 } else { 233 } else {
116 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); 234 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
117 } 235 }
118 } 236 }
119 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); 237 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
120 if (hdr) 238 if (mce_bios_cmci_threshold && bios_wrong_thresh) {
121 printk(KERN_CONT "\n"); 239 pr_info_once(
240 "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
241 pr_info_once(
242 "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
243 }
122} 244}
123 245
124/* 246/*
@@ -156,7 +278,7 @@ void cmci_clear(void)
156 continue; 278 continue;
157 /* Disable CMCI */ 279 /* Disable CMCI */
158 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 280 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
159 val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); 281 val &= ~MCI_CTL2_CMCI_EN;
160 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 282 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
161 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 283 __clear_bit(i, __get_cpu_var(mce_banks_owned));
162 } 284 }
@@ -186,7 +308,7 @@ void cmci_rediscover(int dying)
186 continue; 308 continue;
187 /* Recheck banks in case CPUs don't all have the same */ 309 /* Recheck banks in case CPUs don't all have the same */
188 if (cmci_supported(&banks)) 310 if (cmci_supported(&banks))
189 cmci_discover(banks, 0); 311 cmci_discover(banks);
190 } 312 }
191 313
192 set_cpus_allowed_ptr(current, old); 314 set_cpus_allowed_ptr(current, old);
@@ -200,7 +322,7 @@ void cmci_reenable(void)
200{ 322{
201 int banks; 323 int banks;
202 if (cmci_supported(&banks)) 324 if (cmci_supported(&banks))
203 cmci_discover(banks, 0); 325 cmci_discover(banks);
204} 326}
205 327
206static void intel_init_cmci(void) 328static void intel_init_cmci(void)
@@ -211,7 +333,7 @@ static void intel_init_cmci(void)
211 return; 333 return;
212 334
213 mce_threshold_vector = intel_threshold_interrupt; 335 mce_threshold_vector = intel_threshold_interrupt;
214 cmci_discover(banks, 1); 336 cmci_discover(banks);
215 /* 337 /*
216 * For CPU #0 this runs with still disabled APIC, but that's 338 * For CPU #0 this runs with still disabled APIC, but that's
217 * ok because only the vector is set up. We still do another 339 * ok because only the vector is set up. We still do another
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
index c7b3fe2d72e0..091972ef49de 100644
--- a/arch/x86/kernel/cpu/mkcapflags.pl
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -8,7 +8,10 @@
8open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n"; 8open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
9open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; 9open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
10 10
11print OUT "#include <asm/cpufeature.h>\n\n"; 11print OUT "#ifndef _ASM_X86_CPUFEATURE_H\n";
12print OUT "#include <asm/cpufeature.h>\n";
13print OUT "#endif\n";
14print OUT "\n";
12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; 15print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
13 16
14%features = (); 17%features = ();
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6605a81ba339..271d25700297 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -586,6 +586,8 @@ extern struct event_constraint intel_westmere_pebs_event_constraints[];
586 586
587extern struct event_constraint intel_snb_pebs_event_constraints[]; 587extern struct event_constraint intel_snb_pebs_event_constraints[];
588 588
589extern struct event_constraint intel_ivb_pebs_event_constraints[];
590
589struct event_constraint *intel_pebs_constraints(struct perf_event *event); 591struct event_constraint *intel_pebs_constraints(struct perf_event *event);
590 592
591void intel_pmu_pebs_enable(struct perf_event *event); 593void intel_pmu_pebs_enable(struct perf_event *event);
@@ -624,6 +626,8 @@ int p4_pmu_init(void);
624 626
625int p6_pmu_init(void); 627int p6_pmu_init(void);
626 628
629int knc_pmu_init(void);
630
627#else /* CONFIG_CPU_SUP_INTEL */ 631#else /* CONFIG_CPU_SUP_INTEL */
628 632
629static inline void reserve_ds_buffers(void) 633static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 7bfb5bec8630..6336bcbd0618 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -41,17 +41,22 @@ struct cpu_perf_ibs {
41}; 41};
42 42
43struct perf_ibs { 43struct perf_ibs {
44 struct pmu pmu; 44 struct pmu pmu;
45 unsigned int msr; 45 unsigned int msr;
46 u64 config_mask; 46 u64 config_mask;
47 u64 cnt_mask; 47 u64 cnt_mask;
48 u64 enable_mask; 48 u64 enable_mask;
49 u64 valid_mask; 49 u64 valid_mask;
50 u64 max_period; 50 u64 max_period;
51 unsigned long offset_mask[1]; 51 unsigned long offset_mask[1];
52 int offset_max; 52 int offset_max;
53 struct cpu_perf_ibs __percpu *pcpu; 53 struct cpu_perf_ibs __percpu *pcpu;
54 u64 (*get_count)(u64 config); 54
55 struct attribute **format_attrs;
56 struct attribute_group format_group;
57 const struct attribute_group *attr_groups[2];
58
59 u64 (*get_count)(u64 config);
55}; 60};
56 61
57struct perf_ibs_data { 62struct perf_ibs_data {
@@ -209,6 +214,15 @@ static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
209 return -EOPNOTSUPP; 214 return -EOPNOTSUPP;
210} 215}
211 216
217static const struct perf_event_attr ibs_notsupp = {
218 .exclude_user = 1,
219 .exclude_kernel = 1,
220 .exclude_hv = 1,
221 .exclude_idle = 1,
222 .exclude_host = 1,
223 .exclude_guest = 1,
224};
225
212static int perf_ibs_init(struct perf_event *event) 226static int perf_ibs_init(struct perf_event *event)
213{ 227{
214 struct hw_perf_event *hwc = &event->hw; 228 struct hw_perf_event *hwc = &event->hw;
@@ -229,6 +243,9 @@ static int perf_ibs_init(struct perf_event *event)
229 if (event->pmu != &perf_ibs->pmu) 243 if (event->pmu != &perf_ibs->pmu)
230 return -ENOENT; 244 return -ENOENT;
231 245
246 if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
247 return -EINVAL;
248
232 if (config & ~perf_ibs->config_mask) 249 if (config & ~perf_ibs->config_mask)
233 return -EINVAL; 250 return -EINVAL;
234 251
@@ -434,6 +451,19 @@ static void perf_ibs_del(struct perf_event *event, int flags)
434 451
435static void perf_ibs_read(struct perf_event *event) { } 452static void perf_ibs_read(struct perf_event *event) { }
436 453
454PMU_FORMAT_ATTR(rand_en, "config:57");
455PMU_FORMAT_ATTR(cnt_ctl, "config:19");
456
457static struct attribute *ibs_fetch_format_attrs[] = {
458 &format_attr_rand_en.attr,
459 NULL,
460};
461
462static struct attribute *ibs_op_format_attrs[] = {
463 NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
464 NULL,
465};
466
437static struct perf_ibs perf_ibs_fetch = { 467static struct perf_ibs perf_ibs_fetch = {
438 .pmu = { 468 .pmu = {
439 .task_ctx_nr = perf_invalid_context, 469 .task_ctx_nr = perf_invalid_context,
@@ -453,6 +483,7 @@ static struct perf_ibs perf_ibs_fetch = {
453 .max_period = IBS_FETCH_MAX_CNT << 4, 483 .max_period = IBS_FETCH_MAX_CNT << 4,
454 .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, 484 .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
455 .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, 485 .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
486 .format_attrs = ibs_fetch_format_attrs,
456 487
457 .get_count = get_ibs_fetch_count, 488 .get_count = get_ibs_fetch_count,
458}; 489};
@@ -476,6 +507,7 @@ static struct perf_ibs perf_ibs_op = {
476 .max_period = IBS_OP_MAX_CNT << 4, 507 .max_period = IBS_OP_MAX_CNT << 4,
477 .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, 508 .offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
478 .offset_max = MSR_AMD64_IBSOP_REG_COUNT, 509 .offset_max = MSR_AMD64_IBSOP_REG_COUNT,
510 .format_attrs = ibs_op_format_attrs,
479 511
480 .get_count = get_ibs_op_count, 512 .get_count = get_ibs_op_count,
481}; 513};
@@ -585,6 +617,17 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
585 617
586 perf_ibs->pcpu = pcpu; 618 perf_ibs->pcpu = pcpu;
587 619
620 /* register attributes */
621 if (perf_ibs->format_attrs[0]) {
622 memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
623 perf_ibs->format_group.name = "format";
624 perf_ibs->format_group.attrs = perf_ibs->format_attrs;
625
626 memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
627 perf_ibs->attr_groups[0] = &perf_ibs->format_group;
628 perf_ibs->pmu.attr_groups = perf_ibs->attr_groups;
629 }
630
588 ret = perf_pmu_register(&perf_ibs->pmu, name, -1); 631 ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
589 if (ret) { 632 if (ret) {
590 perf_ibs->pcpu = NULL; 633 perf_ibs->pcpu = NULL;
@@ -596,13 +639,19 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
596 639
597static __init int perf_event_ibs_init(void) 640static __init int perf_event_ibs_init(void)
598{ 641{
642 struct attribute **attr = ibs_op_format_attrs;
643
599 if (!ibs_caps) 644 if (!ibs_caps)
600 return -ENODEV; /* ibs not supported by the cpu */ 645 return -ENODEV; /* ibs not supported by the cpu */
601 646
602 perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); 647 perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
603 if (ibs_caps & IBS_CAPS_OPCNT) 648
649 if (ibs_caps & IBS_CAPS_OPCNT) {
604 perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; 650 perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
651 *attr++ = &format_attr_cnt_ctl.attr;
652 }
605 perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); 653 perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
654
606 register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); 655 register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
607 printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); 656 printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
608 657
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7f2739e03e79..324bb523d9d9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1906,6 +1906,8 @@ __init int intel_pmu_init(void)
1906 switch (boot_cpu_data.x86) { 1906 switch (boot_cpu_data.x86) {
1907 case 0x6: 1907 case 0x6:
1908 return p6_pmu_init(); 1908 return p6_pmu_init();
1909 case 0xb:
1910 return knc_pmu_init();
1909 case 0xf: 1911 case 0xf:
1910 return p4_pmu_init(); 1912 return p4_pmu_init();
1911 } 1913 }
@@ -2008,6 +2010,7 @@ __init int intel_pmu_init(void)
2008 break; 2010 break;
2009 2011
2010 case 28: /* Atom */ 2012 case 28: /* Atom */
2013 case 54: /* Cedariew */
2011 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, 2014 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2012 sizeof(hw_cache_event_ids)); 2015 sizeof(hw_cache_event_ids));
2013 2016
@@ -2047,7 +2050,6 @@ __init int intel_pmu_init(void)
2047 case 42: /* SandyBridge */ 2050 case 42: /* SandyBridge */
2048 case 45: /* SandyBridge, "Romely-EP" */ 2051 case 45: /* SandyBridge, "Romely-EP" */
2049 x86_add_quirk(intel_sandybridge_quirk); 2052 x86_add_quirk(intel_sandybridge_quirk);
2050 case 58: /* IvyBridge */
2051 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 2053 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
2052 sizeof(hw_cache_event_ids)); 2054 sizeof(hw_cache_event_ids));
2053 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, 2055 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -2072,6 +2074,29 @@ __init int intel_pmu_init(void)
2072 2074
2073 pr_cont("SandyBridge events, "); 2075 pr_cont("SandyBridge events, ");
2074 break; 2076 break;
2077 case 58: /* IvyBridge */
2078 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
2079 sizeof(hw_cache_event_ids));
2080 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
2081 sizeof(hw_cache_extra_regs));
2082
2083 intel_pmu_lbr_init_snb();
2084
2085 x86_pmu.event_constraints = intel_snb_event_constraints;
2086 x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
2087 x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
2088 x86_pmu.extra_regs = intel_snb_extra_regs;
2089 /* all extra regs are per-cpu when HT is on */
2090 x86_pmu.er_flags |= ERF_HAS_RSP_1;
2091 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
2092
2093 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
2094 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
2095 X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
2096
2097 pr_cont("IvyBridge events, ");
2098 break;
2099
2075 2100
2076 default: 2101 default:
2077 switch (x86_pmu.version) { 2102 switch (x86_pmu.version) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index e38d97bf4259..826054a4f2ee 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -407,6 +407,20 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
407 EVENT_CONSTRAINT_END 407 EVENT_CONSTRAINT_END
408}; 408};
409 409
410struct event_constraint intel_ivb_pebs_event_constraints[] = {
411 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
412 INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
413 INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
414 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
415 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
416 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
417 INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
418 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
419 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
420 INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
421 EVENT_CONSTRAINT_END
422};
423
410struct event_constraint *intel_pebs_constraints(struct perf_event *event) 424struct event_constraint *intel_pebs_constraints(struct perf_event *event)
411{ 425{
412 struct event_constraint *c; 426 struct event_constraint *c;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 520b4265fcd2..da02e9cc3754 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -686,7 +686,8 @@ void intel_pmu_lbr_init_atom(void)
686 * to have an operational LBR which can freeze 686 * to have an operational LBR which can freeze
687 * on PMU interrupt 687 * on PMU interrupt
688 */ 688 */
689 if (boot_cpu_data.x86_mask < 10) { 689 if (boot_cpu_data.x86_model == 28
690 && boot_cpu_data.x86_mask < 10) {
690 pr_cont("LBR disabled due to erratum"); 691 pr_cont("LBR disabled due to erratum");
691 return; 692 return;
692 } 693 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 0a5571080e74..99d96a4978b5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -661,6 +661,11 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
661 } 661 }
662} 662}
663 663
664static struct uncore_event_desc snb_uncore_events[] = {
665 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
666 { /* end: all zeroes */ },
667};
668
664static struct attribute *snb_uncore_formats_attr[] = { 669static struct attribute *snb_uncore_formats_attr[] = {
665 &format_attr_event.attr, 670 &format_attr_event.attr,
666 &format_attr_umask.attr, 671 &format_attr_umask.attr,
@@ -704,6 +709,7 @@ static struct intel_uncore_type snb_uncore_cbox = {
704 .constraints = snb_uncore_cbox_constraints, 709 .constraints = snb_uncore_cbox_constraints,
705 .ops = &snb_uncore_msr_ops, 710 .ops = &snb_uncore_msr_ops,
706 .format_group = &snb_uncore_format_group, 711 .format_group = &snb_uncore_format_group,
712 .event_descs = snb_uncore_events,
707}; 713};
708 714
709static struct intel_uncore_type *snb_msr_uncores[] = { 715static struct intel_uncore_type *snb_msr_uncores[] = {
@@ -1944,7 +1950,7 @@ struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cp
1944static struct intel_uncore_box * 1950static struct intel_uncore_box *
1945uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) 1951uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
1946{ 1952{
1947 static struct intel_uncore_box *box; 1953 struct intel_uncore_box *box;
1948 1954
1949 box = *per_cpu_ptr(pmu->box, cpu); 1955 box = *per_cpu_ptr(pmu->box, cpu);
1950 if (box) 1956 if (box)
@@ -2341,6 +2347,27 @@ int uncore_pmu_event_init(struct perf_event *event)
2341 return ret; 2347 return ret;
2342} 2348}
2343 2349
2350static ssize_t uncore_get_attr_cpumask(struct device *dev,
2351 struct device_attribute *attr, char *buf)
2352{
2353 int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask);
2354
2355 buf[n++] = '\n';
2356 buf[n] = '\0';
2357 return n;
2358}
2359
2360static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
2361
2362static struct attribute *uncore_pmu_attrs[] = {
2363 &dev_attr_cpumask.attr,
2364 NULL,
2365};
2366
2367static struct attribute_group uncore_pmu_attr_group = {
2368 .attrs = uncore_pmu_attrs,
2369};
2370
2344static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) 2371static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
2345{ 2372{
2346 int ret; 2373 int ret;
@@ -2378,8 +2405,8 @@ static void __init uncore_type_exit(struct intel_uncore_type *type)
2378 free_percpu(type->pmus[i].box); 2405 free_percpu(type->pmus[i].box);
2379 kfree(type->pmus); 2406 kfree(type->pmus);
2380 type->pmus = NULL; 2407 type->pmus = NULL;
2381 kfree(type->attr_groups[1]); 2408 kfree(type->events_group);
2382 type->attr_groups[1] = NULL; 2409 type->events_group = NULL;
2383} 2410}
2384 2411
2385static void __init uncore_types_exit(struct intel_uncore_type **types) 2412static void __init uncore_types_exit(struct intel_uncore_type **types)
@@ -2431,9 +2458,10 @@ static int __init uncore_type_init(struct intel_uncore_type *type)
2431 for (j = 0; j < i; j++) 2458 for (j = 0; j < i; j++)
2432 attrs[j] = &type->event_descs[j].attr.attr; 2459 attrs[j] = &type->event_descs[j].attr.attr;
2433 2460
2434 type->attr_groups[1] = events_group; 2461 type->events_group = events_group;
2435 } 2462 }
2436 2463
2464 type->pmu_group = &uncore_pmu_attr_group;
2437 type->pmus = pmus; 2465 type->pmus = pmus;
2438 return 0; 2466 return 0;
2439fail: 2467fail:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 5b81c1856aac..e68a4550e952 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -369,10 +369,12 @@ struct intel_uncore_type {
369 struct intel_uncore_pmu *pmus; 369 struct intel_uncore_pmu *pmus;
370 struct intel_uncore_ops *ops; 370 struct intel_uncore_ops *ops;
371 struct uncore_event_desc *event_descs; 371 struct uncore_event_desc *event_descs;
372 const struct attribute_group *attr_groups[3]; 372 const struct attribute_group *attr_groups[4];
373}; 373};
374 374
375#define format_group attr_groups[0] 375#define pmu_group attr_groups[0]
376#define format_group attr_groups[1]
377#define events_group attr_groups[2]
376 378
377struct intel_uncore_ops { 379struct intel_uncore_ops {
378 void (*init_box)(struct intel_uncore_box *); 380 void (*init_box)(struct intel_uncore_box *);
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c
new file mode 100644
index 000000000000..7c46bfdbc373
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_knc.c
@@ -0,0 +1,248 @@
1/* Driver for Intel Xeon Phi "Knights Corner" PMU */
2
3#include <linux/perf_event.h>
4#include <linux/types.h>
5
6#include "perf_event.h"
7
8static const u64 knc_perfmon_event_map[] =
9{
10 [PERF_COUNT_HW_CPU_CYCLES] = 0x002a,
11 [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016,
12 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028,
13 [PERF_COUNT_HW_CACHE_MISSES] = 0x0029,
14 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012,
15 [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b,
16};
17
18static __initconst u64 knc_hw_cache_event_ids
19 [PERF_COUNT_HW_CACHE_MAX]
20 [PERF_COUNT_HW_CACHE_OP_MAX]
21 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
22{
23 [ C(L1D) ] = {
24 [ C(OP_READ) ] = {
25 /* On Xeon Phi event "0" is a valid DATA_READ */
26 /* (L1 Data Cache Reads) Instruction. */
27 /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */
28 /* bit will always be set in x86_pmu_hw_config(). */
29 [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
30 /* DATA_READ */
31 [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */
32 },
33 [ C(OP_WRITE) ] = {
34 [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
35 [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */
36 },
37 [ C(OP_PREFETCH) ] = {
38 [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */
39 [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */
40 },
41 },
42 [ C(L1I ) ] = {
43 [ C(OP_READ) ] = {
44 [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
45 [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */
46 },
47 [ C(OP_WRITE) ] = {
48 [ C(RESULT_ACCESS) ] = -1,
49 [ C(RESULT_MISS) ] = -1,
50 },
51 [ C(OP_PREFETCH) ] = {
52 [ C(RESULT_ACCESS) ] = 0x0,
53 [ C(RESULT_MISS) ] = 0x0,
54 },
55 },
56 [ C(LL ) ] = {
57 [ C(OP_READ) ] = {
58 [ C(RESULT_ACCESS) ] = 0,
59 [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */
60 },
61 [ C(OP_WRITE) ] = {
62 [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */
63 [ C(RESULT_MISS) ] = 0,
64 },
65 [ C(OP_PREFETCH) ] = {
66 [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */
67 [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */
68 },
69 },
70 [ C(DTLB) ] = {
71 [ C(OP_READ) ] = {
72 [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
73 /* DATA_READ */
74 /* see note on L1 OP_READ */
75 [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
76 },
77 [ C(OP_WRITE) ] = {
78 [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
79 [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
80 },
81 [ C(OP_PREFETCH) ] = {
82 [ C(RESULT_ACCESS) ] = 0x0,
83 [ C(RESULT_MISS) ] = 0x0,
84 },
85 },
86 [ C(ITLB) ] = {
87 [ C(OP_READ) ] = {
88 [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
89 [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */
90 },
91 [ C(OP_WRITE) ] = {
92 [ C(RESULT_ACCESS) ] = -1,
93 [ C(RESULT_MISS) ] = -1,
94 },
95 [ C(OP_PREFETCH) ] = {
96 [ C(RESULT_ACCESS) ] = -1,
97 [ C(RESULT_MISS) ] = -1,
98 },
99 },
100 [ C(BPU ) ] = {
101 [ C(OP_READ) ] = {
102 [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */
103 [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */
104 },
105 [ C(OP_WRITE) ] = {
106 [ C(RESULT_ACCESS) ] = -1,
107 [ C(RESULT_MISS) ] = -1,
108 },
109 [ C(OP_PREFETCH) ] = {
110 [ C(RESULT_ACCESS) ] = -1,
111 [ C(RESULT_MISS) ] = -1,
112 },
113 },
114};
115
116
117static u64 knc_pmu_event_map(int hw_event)
118{
119 return knc_perfmon_event_map[hw_event];
120}
121
122static struct event_constraint knc_event_constraints[] =
123{
124 INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */
125 INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */
126 INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */
127 INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */
128 INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */
129 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */
130 INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */
131 INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
132 INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
133 INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */
134 INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */
135 INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */
136 INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */
137 INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */
138 INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */
139 INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */
140 INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */
141 INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */
142 INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */
143 INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* L2_DATA_PF2_MISS */
144 INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */
145 EVENT_CONSTRAINT_END
146};
147
148#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d
149#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e
150#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f
151
152#define KNC_ENABLE_COUNTER0 0x00000001
153#define KNC_ENABLE_COUNTER1 0x00000002
154
155static void knc_pmu_disable_all(void)
156{
157 u64 val;
158
159 rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
160 val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
161 wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
162}
163
164static void knc_pmu_enable_all(int added)
165{
166 u64 val;
167
168 rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
169 val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
170 wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
171}
172
173static inline void
174knc_pmu_disable_event(struct perf_event *event)
175{
176 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
177 struct hw_perf_event *hwc = &event->hw;
178 u64 val;
179
180 val = hwc->config;
181 if (cpuc->enabled)
182 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
183
184 (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
185}
186
187static void knc_pmu_enable_event(struct perf_event *event)
188{
189 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
190 struct hw_perf_event *hwc = &event->hw;
191 u64 val;
192
193 val = hwc->config;
194 if (cpuc->enabled)
195 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
196
197 (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
198}
199
200PMU_FORMAT_ATTR(event, "config:0-7" );
201PMU_FORMAT_ATTR(umask, "config:8-15" );
202PMU_FORMAT_ATTR(edge, "config:18" );
203PMU_FORMAT_ATTR(inv, "config:23" );
204PMU_FORMAT_ATTR(cmask, "config:24-31" );
205
206static struct attribute *intel_knc_formats_attr[] = {
207 &format_attr_event.attr,
208 &format_attr_umask.attr,
209 &format_attr_edge.attr,
210 &format_attr_inv.attr,
211 &format_attr_cmask.attr,
212 NULL,
213};
214
215static __initconst struct x86_pmu knc_pmu = {
216 .name = "knc",
217 .handle_irq = x86_pmu_handle_irq,
218 .disable_all = knc_pmu_disable_all,
219 .enable_all = knc_pmu_enable_all,
220 .enable = knc_pmu_enable_event,
221 .disable = knc_pmu_disable_event,
222 .hw_config = x86_pmu_hw_config,
223 .schedule_events = x86_schedule_events,
224 .eventsel = MSR_KNC_EVNTSEL0,
225 .perfctr = MSR_KNC_PERFCTR0,
226 .event_map = knc_pmu_event_map,
227 .max_events = ARRAY_SIZE(knc_perfmon_event_map),
228 .apic = 1,
229 .max_period = (1ULL << 31) - 1,
230 .version = 0,
231 .num_counters = 2,
232 /* in theory 40 bits, early silicon is buggy though */
233 .cntval_bits = 32,
234 .cntval_mask = (1ULL << 32) - 1,
235 .get_event_constraints = x86_get_event_constraints,
236 .event_constraints = knc_event_constraints,
237 .format_attrs = intel_knc_formats_attr,
238};
239
240__init int knc_pmu_init(void)
241{
242 x86_pmu = knc_pmu;
243
244 memcpy(hw_cache_event_ids, knc_hw_cache_event_ids,
245 sizeof(hw_cache_event_ids));
246
247 return 0;
248}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 966512b2cacf..2e8caf03f593 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -56,6 +56,8 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
56 switch (boot_cpu_data.x86) { 56 switch (boot_cpu_data.x86) {
57 case 6: 57 case 6:
58 return msr - MSR_P6_PERFCTR0; 58 return msr - MSR_P6_PERFCTR0;
59 case 11:
60 return msr - MSR_KNC_PERFCTR0;
59 case 15: 61 case 15:
60 return msr - MSR_P4_BPU_PERFCTR0; 62 return msr - MSR_P4_BPU_PERFCTR0;
61 } 63 }
@@ -82,6 +84,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
82 switch (boot_cpu_data.x86) { 84 switch (boot_cpu_data.x86) {
83 case 6: 85 case 6:
84 return msr - MSR_P6_EVNTSEL0; 86 return msr - MSR_P6_EVNTSEL0;
87 case 11:
88 return msr - MSR_KNC_EVNTSEL0;
85 case 15: 89 case 15:
86 return msr - MSR_P4_BSU_ESCR0; 90 return msr - MSR_P4_BSU_ESCR0;
87 } 91 }
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 8022c6681485..fbd895562292 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -140,10 +140,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
140 140
141static void *c_start(struct seq_file *m, loff_t *pos) 141static void *c_start(struct seq_file *m, loff_t *pos)
142{ 142{
143 if (*pos == 0) /* just in case, cpu 0 is not the first */ 143 *pos = cpumask_next(*pos - 1, cpu_online_mask);
144 *pos = cpumask_first(cpu_online_mask);
145 else
146 *pos = cpumask_next(*pos - 1, cpu_online_mask);
147 if ((*pos) < nr_cpu_ids) 144 if ((*pos) < nr_cpu_ids)
148 return &cpu_data(*pos); 145 return &cpu_data(*pos);
149 return NULL; 146 return NULL;
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 39472dd2323f..60c78917190c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -199,12 +199,14 @@ static int __init cpuid_init(void)
199 goto out_chrdev; 199 goto out_chrdev;
200 } 200 }
201 cpuid_class->devnode = cpuid_devnode; 201 cpuid_class->devnode = cpuid_devnode;
202 get_online_cpus();
202 for_each_online_cpu(i) { 203 for_each_online_cpu(i) {
203 err = cpuid_device_create(i); 204 err = cpuid_device_create(i);
204 if (err != 0) 205 if (err != 0)
205 goto out_class; 206 goto out_class;
206 } 207 }
207 register_hotcpu_notifier(&cpuid_class_cpu_notifier); 208 register_hotcpu_notifier(&cpuid_class_cpu_notifier);
209 put_online_cpus();
208 210
209 err = 0; 211 err = 0;
210 goto out; 212 goto out;
@@ -214,6 +216,7 @@ out_class:
214 for_each_online_cpu(i) { 216 for_each_online_cpu(i) {
215 cpuid_device_destroy(i); 217 cpuid_device_destroy(i);
216 } 218 }
219 put_online_cpus();
217 class_destroy(cpuid_class); 220 class_destroy(cpuid_class);
218out_chrdev: 221out_chrdev:
219 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); 222 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
@@ -225,11 +228,13 @@ static void __exit cpuid_exit(void)
225{ 228{
226 int cpu = 0; 229 int cpu = 0;
227 230
231 get_online_cpus();
228 for_each_online_cpu(cpu) 232 for_each_online_cpu(cpu)
229 cpuid_device_destroy(cpu); 233 cpuid_device_destroy(cpu);
230 class_destroy(cpuid_class); 234 class_destroy(cpuid_class);
231 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); 235 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
232 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 236 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
237 put_online_cpus();
233} 238}
234 239
235module_init(cpuid_init); 240module_init(cpuid_init);
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 3ae2ced4a874..b1581527a236 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -342,6 +342,47 @@ const struct irq_domain_ops ioapic_irq_domain_ops = {
342 .xlate = ioapic_xlate, 342 .xlate = ioapic_xlate,
343}; 343};
344 344
345static void dt_add_ioapic_domain(unsigned int ioapic_num,
346 struct device_node *np)
347{
348 struct irq_domain *id;
349 struct mp_ioapic_gsi *gsi_cfg;
350 int ret;
351 int num;
352
353 gsi_cfg = mp_ioapic_gsi_routing(ioapic_num);
354 num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
355
356 id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops,
357 (void *)ioapic_num);
358 BUG_ON(!id);
359 if (gsi_cfg->gsi_base == 0) {
360 /*
361 * The first NR_IRQS_LEGACY irq descs are allocated in
362 * early_irq_init() and need just a mapping. The
363 * remaining irqs need both. All of them are preallocated
364 * and assigned so we can keep the 1:1 mapping which the ioapic
365 * is having.
366 */
367 ret = irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
368 if (ret)
369 pr_err("Error mapping legacy IRQs: %d\n", ret);
370
371 if (num > NR_IRQS_LEGACY) {
372 ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY,
373 NR_IRQS_LEGACY, num - NR_IRQS_LEGACY);
374 if (ret)
375 pr_err("Error creating mapping for the "
376 "remaining IRQs: %d\n", ret);
377 }
378 irq_set_default_host(id);
379 } else {
380 ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num);
381 if (ret)
382 pr_err("Error creating IRQ mapping: %d\n", ret);
383 }
384}
385
345static void __init ioapic_add_ofnode(struct device_node *np) 386static void __init ioapic_add_ofnode(struct device_node *np)
346{ 387{
347 struct resource r; 388 struct resource r;
@@ -356,15 +397,7 @@ static void __init ioapic_add_ofnode(struct device_node *np)
356 397
357 for (i = 0; i < nr_ioapics; i++) { 398 for (i = 0; i < nr_ioapics; i++) {
358 if (r.start == mpc_ioapic_addr(i)) { 399 if (r.start == mpc_ioapic_addr(i)) {
359 struct irq_domain *id; 400 dt_add_ioapic_domain(i, np);
360 struct mp_ioapic_gsi *gsi_cfg;
361
362 gsi_cfg = mp_ioapic_gsi_routing(i);
363
364 id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0,
365 &ioapic_irq_domain_ops,
366 (void*)i);
367 BUG_ON(!id);
368 return; 401 return;
369 } 402 }
370 } 403 }
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 8f8e8eea9085..88b725aa1d52 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -57,6 +57,7 @@
57#include <asm/cpufeature.h> 57#include <asm/cpufeature.h>
58#include <asm/alternative-asm.h> 58#include <asm/alternative-asm.h>
59#include <asm/asm.h> 59#include <asm/asm.h>
60#include <asm/smap.h>
60 61
61/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 62/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
62#include <linux/elf-em.h> 63#include <linux/elf-em.h>
@@ -298,6 +299,21 @@ ENTRY(ret_from_fork)
298 CFI_ENDPROC 299 CFI_ENDPROC
299END(ret_from_fork) 300END(ret_from_fork)
300 301
302ENTRY(ret_from_kernel_thread)
303 CFI_STARTPROC
304 pushl_cfi %eax
305 call schedule_tail
306 GET_THREAD_INFO(%ebp)
307 popl_cfi %eax
308 pushl_cfi $0x0202 # Reset kernel eflags
309 popfl_cfi
310 movl PT_EBP(%esp),%eax
311 call *PT_EBX(%esp)
312 movl $0,PT_EAX(%esp)
313 jmp syscall_exit
314 CFI_ENDPROC
315ENDPROC(ret_from_kernel_thread)
316
301/* 317/*
302 * Interrupt exit functions should be protected against kprobes 318 * Interrupt exit functions should be protected against kprobes
303 */ 319 */
@@ -322,8 +338,7 @@ ret_from_intr:
322 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax 338 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
323#else 339#else
324 /* 340 /*
325 * We can be coming here from a syscall done in the kernel space, 341 * We can be coming here from child spawned by kernel_thread().
326 * e.g. a failed kernel_execve().
327 */ 342 */
328 movl PT_CS(%esp), %eax 343 movl PT_CS(%esp), %eax
329 andl $SEGMENT_RPL_MASK, %eax 344 andl $SEGMENT_RPL_MASK, %eax
@@ -407,7 +422,9 @@ sysenter_past_esp:
407 */ 422 */
408 cmpl $__PAGE_OFFSET-3,%ebp 423 cmpl $__PAGE_OFFSET-3,%ebp
409 jae syscall_fault 424 jae syscall_fault
425 ASM_STAC
4101: movl (%ebp),%ebp 4261: movl (%ebp),%ebp
427 ASM_CLAC
411 movl %ebp,PT_EBP(%esp) 428 movl %ebp,PT_EBP(%esp)
412 _ASM_EXTABLE(1b,syscall_fault) 429 _ASM_EXTABLE(1b,syscall_fault)
413 430
@@ -488,6 +505,7 @@ ENDPROC(ia32_sysenter_target)
488 # system call handler stub 505 # system call handler stub
489ENTRY(system_call) 506ENTRY(system_call)
490 RING0_INT_FRAME # can't unwind into user space anyway 507 RING0_INT_FRAME # can't unwind into user space anyway
508 ASM_CLAC
491 pushl_cfi %eax # save orig_eax 509 pushl_cfi %eax # save orig_eax
492 SAVE_ALL 510 SAVE_ALL
493 GET_THREAD_INFO(%ebp) 511 GET_THREAD_INFO(%ebp)
@@ -612,6 +630,10 @@ work_notifysig: # deal with pending signals and
612 movl %esp, %eax 630 movl %esp, %eax
613 jne work_notifysig_v86 # returning to kernel-space or 631 jne work_notifysig_v86 # returning to kernel-space or
614 # vm86-space 632 # vm86-space
6331:
634#else
635 movl %esp, %eax
636#endif
615 TRACE_IRQS_ON 637 TRACE_IRQS_ON
616 ENABLE_INTERRUPTS(CLBR_NONE) 638 ENABLE_INTERRUPTS(CLBR_NONE)
617 movb PT_CS(%esp), %bl 639 movb PT_CS(%esp), %bl
@@ -622,24 +644,15 @@ work_notifysig: # deal with pending signals and
622 call do_notify_resume 644 call do_notify_resume
623 jmp resume_userspace 645 jmp resume_userspace
624 646
647#ifdef CONFIG_VM86
625 ALIGN 648 ALIGN
626work_notifysig_v86: 649work_notifysig_v86:
627 pushl_cfi %ecx # save ti_flags for do_notify_resume 650 pushl_cfi %ecx # save ti_flags for do_notify_resume
628 call save_v86_state # %eax contains pt_regs pointer 651 call save_v86_state # %eax contains pt_regs pointer
629 popl_cfi %ecx 652 popl_cfi %ecx
630 movl %eax, %esp 653 movl %eax, %esp
631#else 654 jmp 1b
632 movl %esp, %eax
633#endif 655#endif
634 TRACE_IRQS_ON
635 ENABLE_INTERRUPTS(CLBR_NONE)
636 movb PT_CS(%esp), %bl
637 andb $SEGMENT_RPL_MASK, %bl
638 cmpb $USER_RPL, %bl
639 jb resume_kernel
640 xorl %edx, %edx
641 call do_notify_resume
642 jmp resume_userspace
643END(work_pending) 656END(work_pending)
644 657
645 # perform syscall exit tracing 658 # perform syscall exit tracing
@@ -670,6 +683,7 @@ END(syscall_exit_work)
670 683
671 RING0_INT_FRAME # can't unwind into user space anyway 684 RING0_INT_FRAME # can't unwind into user space anyway
672syscall_fault: 685syscall_fault:
686 ASM_CLAC
673 GET_THREAD_INFO(%ebp) 687 GET_THREAD_INFO(%ebp)
674 movl $-EFAULT,PT_EAX(%esp) 688 movl $-EFAULT,PT_EAX(%esp)
675 jmp resume_userspace 689 jmp resume_userspace
@@ -727,7 +741,6 @@ ENDPROC(ptregs_##name)
727PTREGSCALL1(iopl) 741PTREGSCALL1(iopl)
728PTREGSCALL0(fork) 742PTREGSCALL0(fork)
729PTREGSCALL0(vfork) 743PTREGSCALL0(vfork)
730PTREGSCALL3(execve)
731PTREGSCALL2(sigaltstack) 744PTREGSCALL2(sigaltstack)
732PTREGSCALL0(sigreturn) 745PTREGSCALL0(sigreturn)
733PTREGSCALL0(rt_sigreturn) 746PTREGSCALL0(rt_sigreturn)
@@ -825,6 +838,7 @@ END(interrupt)
825 */ 838 */
826 .p2align CONFIG_X86_L1_CACHE_SHIFT 839 .p2align CONFIG_X86_L1_CACHE_SHIFT
827common_interrupt: 840common_interrupt:
841 ASM_CLAC
828 addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ 842 addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
829 SAVE_ALL 843 SAVE_ALL
830 TRACE_IRQS_OFF 844 TRACE_IRQS_OFF
@@ -841,6 +855,7 @@ ENDPROC(common_interrupt)
841#define BUILD_INTERRUPT3(name, nr, fn) \ 855#define BUILD_INTERRUPT3(name, nr, fn) \
842ENTRY(name) \ 856ENTRY(name) \
843 RING0_INT_FRAME; \ 857 RING0_INT_FRAME; \
858 ASM_CLAC; \
844 pushl_cfi $~(nr); \ 859 pushl_cfi $~(nr); \
845 SAVE_ALL; \ 860 SAVE_ALL; \
846 TRACE_IRQS_OFF \ 861 TRACE_IRQS_OFF \
@@ -857,6 +872,7 @@ ENDPROC(name)
857 872
858ENTRY(coprocessor_error) 873ENTRY(coprocessor_error)
859 RING0_INT_FRAME 874 RING0_INT_FRAME
875 ASM_CLAC
860 pushl_cfi $0 876 pushl_cfi $0
861 pushl_cfi $do_coprocessor_error 877 pushl_cfi $do_coprocessor_error
862 jmp error_code 878 jmp error_code
@@ -865,6 +881,7 @@ END(coprocessor_error)
865 881
866ENTRY(simd_coprocessor_error) 882ENTRY(simd_coprocessor_error)
867 RING0_INT_FRAME 883 RING0_INT_FRAME
884 ASM_CLAC
868 pushl_cfi $0 885 pushl_cfi $0
869#ifdef CONFIG_X86_INVD_BUG 886#ifdef CONFIG_X86_INVD_BUG
870 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 887 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
@@ -886,6 +903,7 @@ END(simd_coprocessor_error)
886 903
887ENTRY(device_not_available) 904ENTRY(device_not_available)
888 RING0_INT_FRAME 905 RING0_INT_FRAME
906 ASM_CLAC
889 pushl_cfi $-1 # mark this as an int 907 pushl_cfi $-1 # mark this as an int
890 pushl_cfi $do_device_not_available 908 pushl_cfi $do_device_not_available
891 jmp error_code 909 jmp error_code
@@ -906,6 +924,7 @@ END(native_irq_enable_sysexit)
906 924
907ENTRY(overflow) 925ENTRY(overflow)
908 RING0_INT_FRAME 926 RING0_INT_FRAME
927 ASM_CLAC
909 pushl_cfi $0 928 pushl_cfi $0
910 pushl_cfi $do_overflow 929 pushl_cfi $do_overflow
911 jmp error_code 930 jmp error_code
@@ -914,6 +933,7 @@ END(overflow)
914 933
915ENTRY(bounds) 934ENTRY(bounds)
916 RING0_INT_FRAME 935 RING0_INT_FRAME
936 ASM_CLAC
917 pushl_cfi $0 937 pushl_cfi $0
918 pushl_cfi $do_bounds 938 pushl_cfi $do_bounds
919 jmp error_code 939 jmp error_code
@@ -922,6 +942,7 @@ END(bounds)
922 942
923ENTRY(invalid_op) 943ENTRY(invalid_op)
924 RING0_INT_FRAME 944 RING0_INT_FRAME
945 ASM_CLAC
925 pushl_cfi $0 946 pushl_cfi $0
926 pushl_cfi $do_invalid_op 947 pushl_cfi $do_invalid_op
927 jmp error_code 948 jmp error_code
@@ -930,6 +951,7 @@ END(invalid_op)
930 951
931ENTRY(coprocessor_segment_overrun) 952ENTRY(coprocessor_segment_overrun)
932 RING0_INT_FRAME 953 RING0_INT_FRAME
954 ASM_CLAC
933 pushl_cfi $0 955 pushl_cfi $0
934 pushl_cfi $do_coprocessor_segment_overrun 956 pushl_cfi $do_coprocessor_segment_overrun
935 jmp error_code 957 jmp error_code
@@ -938,6 +960,7 @@ END(coprocessor_segment_overrun)
938 960
939ENTRY(invalid_TSS) 961ENTRY(invalid_TSS)
940 RING0_EC_FRAME 962 RING0_EC_FRAME
963 ASM_CLAC
941 pushl_cfi $do_invalid_TSS 964 pushl_cfi $do_invalid_TSS
942 jmp error_code 965 jmp error_code
943 CFI_ENDPROC 966 CFI_ENDPROC
@@ -945,6 +968,7 @@ END(invalid_TSS)
945 968
946ENTRY(segment_not_present) 969ENTRY(segment_not_present)
947 RING0_EC_FRAME 970 RING0_EC_FRAME
971 ASM_CLAC
948 pushl_cfi $do_segment_not_present 972 pushl_cfi $do_segment_not_present
949 jmp error_code 973 jmp error_code
950 CFI_ENDPROC 974 CFI_ENDPROC
@@ -952,6 +976,7 @@ END(segment_not_present)
952 976
953ENTRY(stack_segment) 977ENTRY(stack_segment)
954 RING0_EC_FRAME 978 RING0_EC_FRAME
979 ASM_CLAC
955 pushl_cfi $do_stack_segment 980 pushl_cfi $do_stack_segment
956 jmp error_code 981 jmp error_code
957 CFI_ENDPROC 982 CFI_ENDPROC
@@ -959,6 +984,7 @@ END(stack_segment)
959 984
960ENTRY(alignment_check) 985ENTRY(alignment_check)
961 RING0_EC_FRAME 986 RING0_EC_FRAME
987 ASM_CLAC
962 pushl_cfi $do_alignment_check 988 pushl_cfi $do_alignment_check
963 jmp error_code 989 jmp error_code
964 CFI_ENDPROC 990 CFI_ENDPROC
@@ -966,6 +992,7 @@ END(alignment_check)
966 992
967ENTRY(divide_error) 993ENTRY(divide_error)
968 RING0_INT_FRAME 994 RING0_INT_FRAME
995 ASM_CLAC
969 pushl_cfi $0 # no error code 996 pushl_cfi $0 # no error code
970 pushl_cfi $do_divide_error 997 pushl_cfi $do_divide_error
971 jmp error_code 998 jmp error_code
@@ -975,6 +1002,7 @@ END(divide_error)
975#ifdef CONFIG_X86_MCE 1002#ifdef CONFIG_X86_MCE
976ENTRY(machine_check) 1003ENTRY(machine_check)
977 RING0_INT_FRAME 1004 RING0_INT_FRAME
1005 ASM_CLAC
978 pushl_cfi $0 1006 pushl_cfi $0
979 pushl_cfi machine_check_vector 1007 pushl_cfi machine_check_vector
980 jmp error_code 1008 jmp error_code
@@ -984,6 +1012,7 @@ END(machine_check)
984 1012
985ENTRY(spurious_interrupt_bug) 1013ENTRY(spurious_interrupt_bug)
986 RING0_INT_FRAME 1014 RING0_INT_FRAME
1015 ASM_CLAC
987 pushl_cfi $0 1016 pushl_cfi $0
988 pushl_cfi $do_spurious_interrupt_bug 1017 pushl_cfi $do_spurious_interrupt_bug
989 jmp error_code 1018 jmp error_code
@@ -994,16 +1023,6 @@ END(spurious_interrupt_bug)
994 */ 1023 */
995 .popsection 1024 .popsection
996 1025
997ENTRY(kernel_thread_helper)
998 pushl $0 # fake return address for unwinder
999 CFI_STARTPROC
1000 movl %edi,%eax
1001 call *%esi
1002 call do_exit
1003 ud2 # padding for call trace
1004 CFI_ENDPROC
1005ENDPROC(kernel_thread_helper)
1006
1007#ifdef CONFIG_XEN 1026#ifdef CONFIG_XEN
1008/* Xen doesn't set %esp to be precisely what the normal sysenter 1027/* Xen doesn't set %esp to be precisely what the normal sysenter
1009 entrypoint expects, so fix it up before using the normal path. */ 1028 entrypoint expects, so fix it up before using the normal path. */
@@ -1111,17 +1130,21 @@ ENTRY(ftrace_caller)
1111 pushl %eax 1130 pushl %eax
1112 pushl %ecx 1131 pushl %ecx
1113 pushl %edx 1132 pushl %edx
1114 movl 0xc(%esp), %eax 1133 pushl $0 /* Pass NULL as regs pointer */
1134 movl 4*4(%esp), %eax
1115 movl 0x4(%ebp), %edx 1135 movl 0x4(%ebp), %edx
1136 leal function_trace_op, %ecx
1116 subl $MCOUNT_INSN_SIZE, %eax 1137 subl $MCOUNT_INSN_SIZE, %eax
1117 1138
1118.globl ftrace_call 1139.globl ftrace_call
1119ftrace_call: 1140ftrace_call:
1120 call ftrace_stub 1141 call ftrace_stub
1121 1142
1143 addl $4,%esp /* skip NULL pointer */
1122 popl %edx 1144 popl %edx
1123 popl %ecx 1145 popl %ecx
1124 popl %eax 1146 popl %eax
1147ftrace_ret:
1125#ifdef CONFIG_FUNCTION_GRAPH_TRACER 1148#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1126.globl ftrace_graph_call 1149.globl ftrace_graph_call
1127ftrace_graph_call: 1150ftrace_graph_call:
@@ -1133,6 +1156,71 @@ ftrace_stub:
1133 ret 1156 ret
1134END(ftrace_caller) 1157END(ftrace_caller)
1135 1158
1159ENTRY(ftrace_regs_caller)
1160 pushf /* push flags before compare (in cs location) */
1161 cmpl $0, function_trace_stop
1162 jne ftrace_restore_flags
1163
1164 /*
1165 * i386 does not save SS and ESP when coming from kernel.
1166 * Instead, to get sp, &regs->sp is used (see ptrace.h).
1167 * Unfortunately, that means eflags must be at the same location
1168 * as the current return ip is. We move the return ip into the
1169 * ip location, and move flags into the return ip location.
1170 */
1171 pushl 4(%esp) /* save return ip into ip slot */
1172
1173 pushl $0 /* Load 0 into orig_ax */
1174 pushl %gs
1175 pushl %fs
1176 pushl %es
1177 pushl %ds
1178 pushl %eax
1179 pushl %ebp
1180 pushl %edi
1181 pushl %esi
1182 pushl %edx
1183 pushl %ecx
1184 pushl %ebx
1185
1186 movl 13*4(%esp), %eax /* Get the saved flags */
1187 movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
1188 /* clobbering return ip */
1189 movl $__KERNEL_CS,13*4(%esp)
1190
1191 movl 12*4(%esp), %eax /* Load ip (1st parameter) */
1192 subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
1193 movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
1194 leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
1195 pushl %esp /* Save pt_regs as 4th parameter */
1196
1197GLOBAL(ftrace_regs_call)
1198 call ftrace_stub
1199
1200 addl $4, %esp /* Skip pt_regs */
1201 movl 14*4(%esp), %eax /* Move flags back into cs */
1202 movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
1203 movl 12*4(%esp), %eax /* Get return ip from regs->ip */
1204 movl %eax, 14*4(%esp) /* Put return ip back for ret */
1205
1206 popl %ebx
1207 popl %ecx
1208 popl %edx
1209 popl %esi
1210 popl %edi
1211 popl %ebp
1212 popl %eax
1213 popl %ds
1214 popl %es
1215 popl %fs
1216 popl %gs
1217 addl $8, %esp /* Skip orig_ax and ip */
1218 popf /* Pop flags at end (no addl to corrupt flags) */
1219 jmp ftrace_ret
1220
1221ftrace_restore_flags:
1222 popf
1223 jmp ftrace_stub
1136#else /* ! CONFIG_DYNAMIC_FTRACE */ 1224#else /* ! CONFIG_DYNAMIC_FTRACE */
1137 1225
1138ENTRY(mcount) 1226ENTRY(mcount)
@@ -1173,9 +1261,6 @@ END(mcount)
1173 1261
1174#ifdef CONFIG_FUNCTION_GRAPH_TRACER 1262#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1175ENTRY(ftrace_graph_caller) 1263ENTRY(ftrace_graph_caller)
1176 cmpl $0, function_trace_stop
1177 jne ftrace_stub
1178
1179 pushl %eax 1264 pushl %eax
1180 pushl %ecx 1265 pushl %ecx
1181 pushl %edx 1266 pushl %edx
@@ -1209,6 +1294,7 @@ return_to_handler:
1209 1294
1210ENTRY(page_fault) 1295ENTRY(page_fault)
1211 RING0_EC_FRAME 1296 RING0_EC_FRAME
1297 ASM_CLAC
1212 pushl_cfi $do_page_fault 1298 pushl_cfi $do_page_fault
1213 ALIGN 1299 ALIGN
1214error_code: 1300error_code:
@@ -1281,6 +1367,7 @@ END(page_fault)
1281 1367
1282ENTRY(debug) 1368ENTRY(debug)
1283 RING0_INT_FRAME 1369 RING0_INT_FRAME
1370 ASM_CLAC
1284 cmpl $ia32_sysenter_target,(%esp) 1371 cmpl $ia32_sysenter_target,(%esp)
1285 jne debug_stack_correct 1372 jne debug_stack_correct
1286 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn 1373 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
@@ -1305,6 +1392,7 @@ END(debug)
1305 */ 1392 */
1306ENTRY(nmi) 1393ENTRY(nmi)
1307 RING0_INT_FRAME 1394 RING0_INT_FRAME
1395 ASM_CLAC
1308 pushl_cfi %eax 1396 pushl_cfi %eax
1309 movl %ss, %eax 1397 movl %ss, %eax
1310 cmpw $__ESPFIX_SS, %ax 1398 cmpw $__ESPFIX_SS, %ax
@@ -1375,6 +1463,7 @@ END(nmi)
1375 1463
1376ENTRY(int3) 1464ENTRY(int3)
1377 RING0_INT_FRAME 1465 RING0_INT_FRAME
1466 ASM_CLAC
1378 pushl_cfi $-1 # mark this as an int 1467 pushl_cfi $-1 # mark this as an int
1379 SAVE_ALL 1468 SAVE_ALL
1380 TRACE_IRQS_OFF 1469 TRACE_IRQS_OFF
@@ -1395,6 +1484,7 @@ END(general_protection)
1395#ifdef CONFIG_KVM_GUEST 1484#ifdef CONFIG_KVM_GUEST
1396ENTRY(async_page_fault) 1485ENTRY(async_page_fault)
1397 RING0_EC_FRAME 1486 RING0_EC_FRAME
1487 ASM_CLAC
1398 pushl_cfi $do_async_page_fault 1488 pushl_cfi $do_async_page_fault
1399 jmp error_code 1489 jmp error_code
1400 CFI_ENDPROC 1490 CFI_ENDPROC
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index dcdd0ea33a32..b51b2c7ee51f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -56,6 +56,8 @@
56#include <asm/ftrace.h> 56#include <asm/ftrace.h>
57#include <asm/percpu.h> 57#include <asm/percpu.h>
58#include <asm/asm.h> 58#include <asm/asm.h>
59#include <asm/rcu.h>
60#include <asm/smap.h>
59#include <linux/err.h> 61#include <linux/err.h>
60 62
61/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 63/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -68,25 +70,51 @@
68 .section .entry.text, "ax" 70 .section .entry.text, "ax"
69 71
70#ifdef CONFIG_FUNCTION_TRACER 72#ifdef CONFIG_FUNCTION_TRACER
73
74#ifdef CC_USING_FENTRY
75# define function_hook __fentry__
76#else
77# define function_hook mcount
78#endif
79
71#ifdef CONFIG_DYNAMIC_FTRACE 80#ifdef CONFIG_DYNAMIC_FTRACE
72ENTRY(mcount) 81
82ENTRY(function_hook)
73 retq 83 retq
74END(mcount) 84END(function_hook)
85
86/* skip is set if stack has been adjusted */
87.macro ftrace_caller_setup skip=0
88 MCOUNT_SAVE_FRAME \skip
89
90 /* Load the ftrace_ops into the 3rd parameter */
91 leaq function_trace_op, %rdx
92
93 /* Load ip into the first parameter */
94 movq RIP(%rsp), %rdi
95 subq $MCOUNT_INSN_SIZE, %rdi
96 /* Load the parent_ip into the second parameter */
97#ifdef CC_USING_FENTRY
98 movq SS+16(%rsp), %rsi
99#else
100 movq 8(%rbp), %rsi
101#endif
102.endm
75 103
76ENTRY(ftrace_caller) 104ENTRY(ftrace_caller)
105 /* Check if tracing was disabled (quick check) */
77 cmpl $0, function_trace_stop 106 cmpl $0, function_trace_stop
78 jne ftrace_stub 107 jne ftrace_stub
79 108
80 MCOUNT_SAVE_FRAME 109 ftrace_caller_setup
81 110 /* regs go into 4th parameter (but make it NULL) */
82 movq 0x38(%rsp), %rdi 111 movq $0, %rcx
83 movq 8(%rbp), %rsi
84 subq $MCOUNT_INSN_SIZE, %rdi
85 112
86GLOBAL(ftrace_call) 113GLOBAL(ftrace_call)
87 call ftrace_stub 114 call ftrace_stub
88 115
89 MCOUNT_RESTORE_FRAME 116 MCOUNT_RESTORE_FRAME
117ftrace_return:
90 118
91#ifdef CONFIG_FUNCTION_GRAPH_TRACER 119#ifdef CONFIG_FUNCTION_GRAPH_TRACER
92GLOBAL(ftrace_graph_call) 120GLOBAL(ftrace_graph_call)
@@ -97,8 +125,78 @@ GLOBAL(ftrace_stub)
97 retq 125 retq
98END(ftrace_caller) 126END(ftrace_caller)
99 127
128ENTRY(ftrace_regs_caller)
129 /* Save the current flags before compare (in SS location)*/
130 pushfq
131
132 /* Check if tracing was disabled (quick check) */
133 cmpl $0, function_trace_stop
134 jne ftrace_restore_flags
135
136 /* skip=8 to skip flags saved in SS */
137 ftrace_caller_setup 8
138
139 /* Save the rest of pt_regs */
140 movq %r15, R15(%rsp)
141 movq %r14, R14(%rsp)
142 movq %r13, R13(%rsp)
143 movq %r12, R12(%rsp)
144 movq %r11, R11(%rsp)
145 movq %r10, R10(%rsp)
146 movq %rbp, RBP(%rsp)
147 movq %rbx, RBX(%rsp)
148 /* Copy saved flags */
149 movq SS(%rsp), %rcx
150 movq %rcx, EFLAGS(%rsp)
151 /* Kernel segments */
152 movq $__KERNEL_DS, %rcx
153 movq %rcx, SS(%rsp)
154 movq $__KERNEL_CS, %rcx
155 movq %rcx, CS(%rsp)
156 /* Stack - skipping return address */
157 leaq SS+16(%rsp), %rcx
158 movq %rcx, RSP(%rsp)
159
160 /* regs go into 4th parameter */
161 leaq (%rsp), %rcx
162
163GLOBAL(ftrace_regs_call)
164 call ftrace_stub
165
166 /* Copy flags back to SS, to restore them */
167 movq EFLAGS(%rsp), %rax
168 movq %rax, SS(%rsp)
169
170 /* Handlers can change the RIP */
171 movq RIP(%rsp), %rax
172 movq %rax, SS+8(%rsp)
173
174 /* restore the rest of pt_regs */
175 movq R15(%rsp), %r15
176 movq R14(%rsp), %r14
177 movq R13(%rsp), %r13
178 movq R12(%rsp), %r12
179 movq R10(%rsp), %r10
180 movq RBP(%rsp), %rbp
181 movq RBX(%rsp), %rbx
182
183 /* skip=8 to skip flags saved in SS */
184 MCOUNT_RESTORE_FRAME 8
185
186 /* Restore flags */
187 popfq
188
189 jmp ftrace_return
190ftrace_restore_flags:
191 popfq
192 jmp ftrace_stub
193
194END(ftrace_regs_caller)
195
196
100#else /* ! CONFIG_DYNAMIC_FTRACE */ 197#else /* ! CONFIG_DYNAMIC_FTRACE */
101ENTRY(mcount) 198
199ENTRY(function_hook)
102 cmpl $0, function_trace_stop 200 cmpl $0, function_trace_stop
103 jne ftrace_stub 201 jne ftrace_stub
104 202
@@ -119,8 +217,12 @@ GLOBAL(ftrace_stub)
119trace: 217trace:
120 MCOUNT_SAVE_FRAME 218 MCOUNT_SAVE_FRAME
121 219
122 movq 0x38(%rsp), %rdi 220 movq RIP(%rsp), %rdi
221#ifdef CC_USING_FENTRY
222 movq SS+16(%rsp), %rsi
223#else
123 movq 8(%rbp), %rsi 224 movq 8(%rbp), %rsi
225#endif
124 subq $MCOUNT_INSN_SIZE, %rdi 226 subq $MCOUNT_INSN_SIZE, %rdi
125 227
126 call *ftrace_trace_function 228 call *ftrace_trace_function
@@ -128,20 +230,22 @@ trace:
128 MCOUNT_RESTORE_FRAME 230 MCOUNT_RESTORE_FRAME
129 231
130 jmp ftrace_stub 232 jmp ftrace_stub
131END(mcount) 233END(function_hook)
132#endif /* CONFIG_DYNAMIC_FTRACE */ 234#endif /* CONFIG_DYNAMIC_FTRACE */
133#endif /* CONFIG_FUNCTION_TRACER */ 235#endif /* CONFIG_FUNCTION_TRACER */
134 236
135#ifdef CONFIG_FUNCTION_GRAPH_TRACER 237#ifdef CONFIG_FUNCTION_GRAPH_TRACER
136ENTRY(ftrace_graph_caller) 238ENTRY(ftrace_graph_caller)
137 cmpl $0, function_trace_stop
138 jne ftrace_stub
139
140 MCOUNT_SAVE_FRAME 239 MCOUNT_SAVE_FRAME
141 240
241#ifdef CC_USING_FENTRY
242 leaq SS+16(%rsp), %rdi
243 movq $0, %rdx /* No framepointers needed */
244#else
142 leaq 8(%rbp), %rdi 245 leaq 8(%rbp), %rdi
143 movq 0x38(%rsp), %rsi
144 movq (%rbp), %rdx 246 movq (%rbp), %rdx
247#endif
248 movq RIP(%rsp), %rsi
145 subq $MCOUNT_INSN_SIZE, %rsi 249 subq $MCOUNT_INSN_SIZE, %rsi
146 250
147 call prepare_ftrace_return 251 call prepare_ftrace_return
@@ -342,15 +446,15 @@ ENDPROC(native_usergs_sysret64)
342 .macro SAVE_ARGS_IRQ 446 .macro SAVE_ARGS_IRQ
343 cld 447 cld
344 /* start from rbp in pt_regs and jump over */ 448 /* start from rbp in pt_regs and jump over */
345 movq_cfi rdi, RDI-RBP 449 movq_cfi rdi, (RDI-RBP)
346 movq_cfi rsi, RSI-RBP 450 movq_cfi rsi, (RSI-RBP)
347 movq_cfi rdx, RDX-RBP 451 movq_cfi rdx, (RDX-RBP)
348 movq_cfi rcx, RCX-RBP 452 movq_cfi rcx, (RCX-RBP)
349 movq_cfi rax, RAX-RBP 453 movq_cfi rax, (RAX-RBP)
350 movq_cfi r8, R8-RBP 454 movq_cfi r8, (R8-RBP)
351 movq_cfi r9, R9-RBP 455 movq_cfi r9, (R9-RBP)
352 movq_cfi r10, R10-RBP 456 movq_cfi r10, (R10-RBP)
353 movq_cfi r11, R11-RBP 457 movq_cfi r11, (R11-RBP)
354 458
355 /* Save rbp so that we can unwind from get_irq_regs() */ 459 /* Save rbp so that we can unwind from get_irq_regs() */
356 movq_cfi rbp, 0 460 movq_cfi rbp, 0
@@ -384,7 +488,7 @@ ENDPROC(native_usergs_sysret64)
384 .endm 488 .endm
385 489
386ENTRY(save_rest) 490ENTRY(save_rest)
387 PARTIAL_FRAME 1 REST_SKIP+8 491 PARTIAL_FRAME 1 (REST_SKIP+8)
388 movq 5*8+16(%rsp), %r11 /* save return address */ 492 movq 5*8+16(%rsp), %r11 /* save return address */
389 movq_cfi rbx, RBX+16 493 movq_cfi rbx, RBX+16
390 movq_cfi rbp, RBP+16 494 movq_cfi rbp, RBP+16
@@ -440,7 +544,7 @@ ENTRY(ret_from_fork)
440 544
441 LOCK ; btr $TIF_FORK,TI_flags(%r8) 545 LOCK ; btr $TIF_FORK,TI_flags(%r8)
442 546
443 pushq_cfi kernel_eflags(%rip) 547 pushq_cfi $0x0002
444 popfq_cfi # reset kernel eflags 548 popfq_cfi # reset kernel eflags
445 549
446 call schedule_tail # rdi: 'prev' task parameter 550 call schedule_tail # rdi: 'prev' task parameter
@@ -450,7 +554,7 @@ ENTRY(ret_from_fork)
450 RESTORE_REST 554 RESTORE_REST
451 555
452 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 556 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
453 jz retint_restore_args 557 jz 1f
454 558
455 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET 559 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
456 jnz int_ret_from_sys_call 560 jnz int_ret_from_sys_call
@@ -458,6 +562,14 @@ ENTRY(ret_from_fork)
458 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET 562 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
459 jmp ret_from_sys_call # go to the SYSRET fastpath 563 jmp ret_from_sys_call # go to the SYSRET fastpath
460 564
5651:
566 subq $REST_SKIP, %rsp # leave space for volatiles
567 CFI_ADJUST_CFA_OFFSET REST_SKIP
568 movq %rbp, %rdi
569 call *%rbx
570 movl $0, RAX(%rsp)
571 RESTORE_REST
572 jmp int_ret_from_sys_call
461 CFI_ENDPROC 573 CFI_ENDPROC
462END(ret_from_fork) 574END(ret_from_fork)
463 575
@@ -465,7 +577,8 @@ END(ret_from_fork)
465 * System call entry. Up to 6 arguments in registers are supported. 577 * System call entry. Up to 6 arguments in registers are supported.
466 * 578 *
467 * SYSCALL does not save anything on the stack and does not change the 579 * SYSCALL does not save anything on the stack and does not change the
468 * stack pointer. 580 * stack pointer. However, it does mask the flags register for us, so
581 * CLD and CLAC are not needed.
469 */ 582 */
470 583
471/* 584/*
@@ -565,7 +678,7 @@ sysret_careful:
565 TRACE_IRQS_ON 678 TRACE_IRQS_ON
566 ENABLE_INTERRUPTS(CLBR_NONE) 679 ENABLE_INTERRUPTS(CLBR_NONE)
567 pushq_cfi %rdi 680 pushq_cfi %rdi
568 call schedule 681 SCHEDULE_USER
569 popq_cfi %rdi 682 popq_cfi %rdi
570 jmp sysret_check 683 jmp sysret_check
571 684
@@ -678,7 +791,7 @@ int_careful:
678 TRACE_IRQS_ON 791 TRACE_IRQS_ON
679 ENABLE_INTERRUPTS(CLBR_NONE) 792 ENABLE_INTERRUPTS(CLBR_NONE)
680 pushq_cfi %rdi 793 pushq_cfi %rdi
681 call schedule 794 SCHEDULE_USER
682 popq_cfi %rdi 795 popq_cfi %rdi
683 DISABLE_INTERRUPTS(CLBR_NONE) 796 DISABLE_INTERRUPTS(CLBR_NONE)
684 TRACE_IRQS_OFF 797 TRACE_IRQS_OFF
@@ -757,7 +870,6 @@ ENTRY(stub_execve)
757 PARTIAL_FRAME 0 870 PARTIAL_FRAME 0
758 SAVE_REST 871 SAVE_REST
759 FIXUP_TOP_OF_STACK %r11 872 FIXUP_TOP_OF_STACK %r11
760 movq %rsp, %rcx
761 call sys_execve 873 call sys_execve
762 RESTORE_TOP_OF_STACK %r11 874 RESTORE_TOP_OF_STACK %r11
763 movq %rax,RAX(%rsp) 875 movq %rax,RAX(%rsp)
@@ -807,8 +919,7 @@ ENTRY(stub_x32_execve)
807 PARTIAL_FRAME 0 919 PARTIAL_FRAME 0
808 SAVE_REST 920 SAVE_REST
809 FIXUP_TOP_OF_STACK %r11 921 FIXUP_TOP_OF_STACK %r11
810 movq %rsp, %rcx 922 call compat_sys_execve
811 call sys32_execve
812 RESTORE_TOP_OF_STACK %r11 923 RESTORE_TOP_OF_STACK %r11
813 movq %rax,RAX(%rsp) 924 movq %rax,RAX(%rsp)
814 RESTORE_REST 925 RESTORE_REST
@@ -884,6 +995,7 @@ END(interrupt)
884 */ 995 */
885 .p2align CONFIG_X86_L1_CACHE_SHIFT 996 .p2align CONFIG_X86_L1_CACHE_SHIFT
886common_interrupt: 997common_interrupt:
998 ASM_CLAC
887 XCPT_FRAME 999 XCPT_FRAME
888 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 1000 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
889 interrupt do_IRQ 1001 interrupt do_IRQ
@@ -974,7 +1086,7 @@ retint_careful:
974 TRACE_IRQS_ON 1086 TRACE_IRQS_ON
975 ENABLE_INTERRUPTS(CLBR_NONE) 1087 ENABLE_INTERRUPTS(CLBR_NONE)
976 pushq_cfi %rdi 1088 pushq_cfi %rdi
977 call schedule 1089 SCHEDULE_USER
978 popq_cfi %rdi 1090 popq_cfi %rdi
979 GET_THREAD_INFO(%rcx) 1091 GET_THREAD_INFO(%rcx)
980 DISABLE_INTERRUPTS(CLBR_NONE) 1092 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -1023,6 +1135,7 @@ END(common_interrupt)
1023 */ 1135 */
1024.macro apicinterrupt num sym do_sym 1136.macro apicinterrupt num sym do_sym
1025ENTRY(\sym) 1137ENTRY(\sym)
1138 ASM_CLAC
1026 INTR_FRAME 1139 INTR_FRAME
1027 pushq_cfi $~(\num) 1140 pushq_cfi $~(\num)
1028.Lcommon_\sym: 1141.Lcommon_\sym:
@@ -1077,6 +1190,7 @@ apicinterrupt IRQ_WORK_VECTOR \
1077 */ 1190 */
1078.macro zeroentry sym do_sym 1191.macro zeroentry sym do_sym
1079ENTRY(\sym) 1192ENTRY(\sym)
1193 ASM_CLAC
1080 INTR_FRAME 1194 INTR_FRAME
1081 PARAVIRT_ADJUST_EXCEPTION_FRAME 1195 PARAVIRT_ADJUST_EXCEPTION_FRAME
1082 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1196 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
@@ -1094,6 +1208,7 @@ END(\sym)
1094 1208
1095.macro paranoidzeroentry sym do_sym 1209.macro paranoidzeroentry sym do_sym
1096ENTRY(\sym) 1210ENTRY(\sym)
1211 ASM_CLAC
1097 INTR_FRAME 1212 INTR_FRAME
1098 PARAVIRT_ADJUST_EXCEPTION_FRAME 1213 PARAVIRT_ADJUST_EXCEPTION_FRAME
1099 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1214 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
@@ -1112,6 +1227,7 @@ END(\sym)
1112#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) 1227#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
1113.macro paranoidzeroentry_ist sym do_sym ist 1228.macro paranoidzeroentry_ist sym do_sym ist
1114ENTRY(\sym) 1229ENTRY(\sym)
1230 ASM_CLAC
1115 INTR_FRAME 1231 INTR_FRAME
1116 PARAVIRT_ADJUST_EXCEPTION_FRAME 1232 PARAVIRT_ADJUST_EXCEPTION_FRAME
1117 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1233 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
@@ -1131,6 +1247,7 @@ END(\sym)
1131 1247
1132.macro errorentry sym do_sym 1248.macro errorentry sym do_sym
1133ENTRY(\sym) 1249ENTRY(\sym)
1250 ASM_CLAC
1134 XCPT_FRAME 1251 XCPT_FRAME
1135 PARAVIRT_ADJUST_EXCEPTION_FRAME 1252 PARAVIRT_ADJUST_EXCEPTION_FRAME
1136 subq $ORIG_RAX-R15, %rsp 1253 subq $ORIG_RAX-R15, %rsp
@@ -1149,6 +1266,7 @@ END(\sym)
1149 /* error code is on the stack already */ 1266 /* error code is on the stack already */
1150.macro paranoiderrorentry sym do_sym 1267.macro paranoiderrorentry sym do_sym
1151ENTRY(\sym) 1268ENTRY(\sym)
1269 ASM_CLAC
1152 XCPT_FRAME 1270 XCPT_FRAME
1153 PARAVIRT_ADJUST_EXCEPTION_FRAME 1271 PARAVIRT_ADJUST_EXCEPTION_FRAME
1154 subq $ORIG_RAX-R15, %rsp 1272 subq $ORIG_RAX-R15, %rsp
@@ -1206,52 +1324,6 @@ bad_gs:
1206 jmp 2b 1324 jmp 2b
1207 .previous 1325 .previous
1208 1326
1209ENTRY(kernel_thread_helper)
1210 pushq $0 # fake return address
1211 CFI_STARTPROC
1212 /*
1213 * Here we are in the child and the registers are set as they were
1214 * at kernel_thread() invocation in the parent.
1215 */
1216 call *%rsi
1217 # exit
1218 mov %eax, %edi
1219 call do_exit
1220 ud2 # padding for call trace
1221 CFI_ENDPROC
1222END(kernel_thread_helper)
1223
1224/*
1225 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
1226 *
1227 * C extern interface:
1228 * extern long execve(const char *name, char **argv, char **envp)
1229 *
1230 * asm input arguments:
1231 * rdi: name, rsi: argv, rdx: envp
1232 *
1233 * We want to fallback into:
1234 * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
1235 *
1236 * do_sys_execve asm fallback arguments:
1237 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
1238 */
1239ENTRY(kernel_execve)
1240 CFI_STARTPROC
1241 FAKE_STACK_FRAME $0
1242 SAVE_ALL
1243 movq %rsp,%rcx
1244 call sys_execve
1245 movq %rax, RAX(%rsp)
1246 RESTORE_REST
1247 testq %rax,%rax
1248 je int_ret_from_sys_call
1249 RESTORE_ARGS
1250 UNFAKE_STACK_FRAME
1251 ret
1252 CFI_ENDPROC
1253END(kernel_execve)
1254
1255/* Call softirq on interrupt stack. Interrupts are off. */ 1327/* Call softirq on interrupt stack. Interrupts are off. */
1256ENTRY(call_softirq) 1328ENTRY(call_softirq)
1257 CFI_STARTPROC 1329 CFI_STARTPROC
@@ -1449,7 +1521,7 @@ paranoid_userspace:
1449paranoid_schedule: 1521paranoid_schedule:
1450 TRACE_IRQS_ON 1522 TRACE_IRQS_ON
1451 ENABLE_INTERRUPTS(CLBR_ANY) 1523 ENABLE_INTERRUPTS(CLBR_ANY)
1452 call schedule 1524 SCHEDULE_USER
1453 DISABLE_INTERRUPTS(CLBR_ANY) 1525 DISABLE_INTERRUPTS(CLBR_ANY)
1454 TRACE_IRQS_OFF 1526 TRACE_IRQS_OFF
1455 jmp paranoid_userspace 1527 jmp paranoid_userspace
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c3a7cb4bf6e6..1d414029f1d8 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -206,6 +206,21 @@ static int
206ftrace_modify_code(unsigned long ip, unsigned const char *old_code, 206ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
207 unsigned const char *new_code); 207 unsigned const char *new_code);
208 208
209/*
210 * Should never be called:
211 * As it is only called by __ftrace_replace_code() which is called by
212 * ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
213 * which is called to turn mcount into nops or nops into function calls
214 * but not to convert a function from not using regs to one that uses
215 * regs, which ftrace_modify_call() is for.
216 */
217int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
218 unsigned long addr)
219{
220 WARN_ON(1);
221 return -EINVAL;
222}
223
209int ftrace_update_ftrace_func(ftrace_func_t func) 224int ftrace_update_ftrace_func(ftrace_func_t func)
210{ 225{
211 unsigned long ip = (unsigned long)(&ftrace_call); 226 unsigned long ip = (unsigned long)(&ftrace_call);
@@ -220,6 +235,14 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
220 235
221 ret = ftrace_modify_code(ip, old, new); 236 ret = ftrace_modify_code(ip, old, new);
222 237
238 /* Also update the regs callback function */
239 if (!ret) {
240 ip = (unsigned long)(&ftrace_regs_call);
241 memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE);
242 new = ftrace_call_replace(ip, (unsigned long)func);
243 ret = ftrace_modify_code(ip, old, new);
244 }
245
223 atomic_dec(&modifying_ftrace_code); 246 atomic_dec(&modifying_ftrace_code);
224 247
225 return ret; 248 return ret;
@@ -299,6 +322,32 @@ static int add_brk_on_nop(struct dyn_ftrace *rec)
299 return add_break(rec->ip, old); 322 return add_break(rec->ip, old);
300} 323}
301 324
325/*
326 * If the record has the FTRACE_FL_REGS set, that means that it
327 * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
328 * is not not set, then it wants to convert to the normal callback.
329 */
330static unsigned long get_ftrace_addr(struct dyn_ftrace *rec)
331{
332 if (rec->flags & FTRACE_FL_REGS)
333 return (unsigned long)FTRACE_REGS_ADDR;
334 else
335 return (unsigned long)FTRACE_ADDR;
336}
337
338/*
339 * The FTRACE_FL_REGS_EN is set when the record already points to
340 * a function that saves all the regs. Basically the '_EN' version
341 * represents the current state of the function.
342 */
343static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec)
344{
345 if (rec->flags & FTRACE_FL_REGS_EN)
346 return (unsigned long)FTRACE_REGS_ADDR;
347 else
348 return (unsigned long)FTRACE_ADDR;
349}
350
302static int add_breakpoints(struct dyn_ftrace *rec, int enable) 351static int add_breakpoints(struct dyn_ftrace *rec, int enable)
303{ 352{
304 unsigned long ftrace_addr; 353 unsigned long ftrace_addr;
@@ -306,7 +355,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)
306 355
307 ret = ftrace_test_record(rec, enable); 356 ret = ftrace_test_record(rec, enable);
308 357
309 ftrace_addr = (unsigned long)FTRACE_ADDR; 358 ftrace_addr = get_ftrace_addr(rec);
310 359
311 switch (ret) { 360 switch (ret) {
312 case FTRACE_UPDATE_IGNORE: 361 case FTRACE_UPDATE_IGNORE:
@@ -316,6 +365,10 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)
316 /* converting nop to call */ 365 /* converting nop to call */
317 return add_brk_on_nop(rec); 366 return add_brk_on_nop(rec);
318 367
368 case FTRACE_UPDATE_MODIFY_CALL_REGS:
369 case FTRACE_UPDATE_MODIFY_CALL:
370 ftrace_addr = get_ftrace_old_addr(rec);
371 /* fall through */
319 case FTRACE_UPDATE_MAKE_NOP: 372 case FTRACE_UPDATE_MAKE_NOP:
320 /* converting a call to a nop */ 373 /* converting a call to a nop */
321 return add_brk_on_call(rec, ftrace_addr); 374 return add_brk_on_call(rec, ftrace_addr);
@@ -360,13 +413,21 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
360 * If not, don't touch the breakpoint, we make just create 413 * If not, don't touch the breakpoint, we make just create
361 * a disaster. 414 * a disaster.
362 */ 415 */
363 ftrace_addr = (unsigned long)FTRACE_ADDR; 416 ftrace_addr = get_ftrace_addr(rec);
417 nop = ftrace_call_replace(ip, ftrace_addr);
418
419 if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
420 goto update;
421
422 /* Check both ftrace_addr and ftrace_old_addr */
423 ftrace_addr = get_ftrace_old_addr(rec);
364 nop = ftrace_call_replace(ip, ftrace_addr); 424 nop = ftrace_call_replace(ip, ftrace_addr);
365 425
366 if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) 426 if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
367 return -EINVAL; 427 return -EINVAL;
368 } 428 }
369 429
430 update:
370 return probe_kernel_write((void *)ip, &nop[0], 1); 431 return probe_kernel_write((void *)ip, &nop[0], 1);
371} 432}
372 433
@@ -405,12 +466,14 @@ static int add_update(struct dyn_ftrace *rec, int enable)
405 466
406 ret = ftrace_test_record(rec, enable); 467 ret = ftrace_test_record(rec, enable);
407 468
408 ftrace_addr = (unsigned long)FTRACE_ADDR; 469 ftrace_addr = get_ftrace_addr(rec);
409 470
410 switch (ret) { 471 switch (ret) {
411 case FTRACE_UPDATE_IGNORE: 472 case FTRACE_UPDATE_IGNORE:
412 return 0; 473 return 0;
413 474
475 case FTRACE_UPDATE_MODIFY_CALL_REGS:
476 case FTRACE_UPDATE_MODIFY_CALL:
414 case FTRACE_UPDATE_MAKE_CALL: 477 case FTRACE_UPDATE_MAKE_CALL:
415 /* converting nop to call */ 478 /* converting nop to call */
416 return add_update_call(rec, ftrace_addr); 479 return add_update_call(rec, ftrace_addr);
@@ -455,12 +518,14 @@ static int finish_update(struct dyn_ftrace *rec, int enable)
455 518
456 ret = ftrace_update_record(rec, enable); 519 ret = ftrace_update_record(rec, enable);
457 520
458 ftrace_addr = (unsigned long)FTRACE_ADDR; 521 ftrace_addr = get_ftrace_addr(rec);
459 522
460 switch (ret) { 523 switch (ret) {
461 case FTRACE_UPDATE_IGNORE: 524 case FTRACE_UPDATE_IGNORE:
462 return 0; 525 return 0;
463 526
527 case FTRACE_UPDATE_MODIFY_CALL_REGS:
528 case FTRACE_UPDATE_MODIFY_CALL:
464 case FTRACE_UPDATE_MAKE_CALL: 529 case FTRACE_UPDATE_MAKE_CALL:
465 /* converting nop to call */ 530 /* converting nop to call */
466 return finish_update_call(rec, ftrace_addr); 531 return finish_update_call(rec, ftrace_addr);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index d42ab17b7397..957a47aec64e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -287,27 +287,28 @@ ENTRY(startup_32_smp)
287 leal -__PAGE_OFFSET(%ecx),%esp 287 leal -__PAGE_OFFSET(%ecx),%esp
288 288
289default_entry: 289default_entry:
290
291/* 290/*
292 * New page tables may be in 4Mbyte page mode and may 291 * New page tables may be in 4Mbyte page mode and may
293 * be using the global pages. 292 * be using the global pages.
294 * 293 *
295 * NOTE! If we are on a 486 we may have no cr4 at all! 294 * NOTE! If we are on a 486 we may have no cr4 at all!
296 * So we do not try to touch it unless we really have 295 * Specifically, cr4 exists if and only if CPUID exists,
297 * some bits in it to set. This won't work if the BSP 296 * which in turn exists if and only if EFLAGS.ID exists.
298 * implements cr4 but this AP does not -- very unlikely
299 * but be warned! The same applies to the pse feature
300 * if not equally supported. --macro
301 *
302 * NOTE! We have to correct for the fact that we're
303 * not yet offset PAGE_OFFSET..
304 */ 297 */
305#define cr4_bits pa(mmu_cr4_features) 298 movl $X86_EFLAGS_ID,%ecx
306 movl cr4_bits,%edx 299 pushl %ecx
307 andl %edx,%edx 300 popfl
308 jz 6f 301 pushfl
309 movl %cr4,%eax # Turn on paging options (PSE,PAE,..) 302 popl %eax
310 orl %edx,%eax 303 pushl $0
304 popfl
305 pushfl
306 popl %edx
307 xorl %edx,%eax
308 testl %ecx,%eax
309 jz 6f # No ID flag = no CPUID = no CR4
310
311 movl pa(mmu_cr4_features),%eax
311 movl %eax,%cr4 312 movl %eax,%cr4
312 313
313 testb $X86_CR4_PAE, %al # check if PAE is enabled 314 testb $X86_CR4_PAE, %al # check if PAE is enabled
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f250431fb505..675a05012449 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -19,24 +19,17 @@
19#include <asm/fpu-internal.h> 19#include <asm/fpu-internal.h>
20#include <asm/user.h> 20#include <asm/user.h>
21 21
22#ifdef CONFIG_X86_64
23# include <asm/sigcontext32.h>
24# include <asm/user32.h>
25#else
26# define save_i387_xstate_ia32 save_i387_xstate
27# define restore_i387_xstate_ia32 restore_i387_xstate
28# define _fpstate_ia32 _fpstate
29# define _xstate_ia32 _xstate
30# define sig_xstate_ia32_size sig_xstate_size
31# define fx_sw_reserved_ia32 fx_sw_reserved
32# define user_i387_ia32_struct user_i387_struct
33# define user32_fxsr_struct user_fxsr_struct
34#endif
35
36/* 22/*
37 * Were we in an interrupt that interrupted kernel mode? 23 * Were we in an interrupt that interrupted kernel mode?
38 * 24 *
39 * We can do a kernel_fpu_begin/end() pair *ONLY* if that 25 * For now, with eagerfpu we will return interrupted kernel FPU
26 * state as not-idle. TBD: Ideally we can change the return value
27 * to something like __thread_has_fpu(current). But we need to
28 * be careful of doing __thread_clear_has_fpu() before saving
29 * the FPU etc for supporting nested uses etc. For now, take
30 * the simple route!
31 *
32 * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
40 * pair does nothing at all: the thread must not have fpu (so 33 * pair does nothing at all: the thread must not have fpu (so
41 * that we don't try to save the FPU state), and TS must 34 * that we don't try to save the FPU state), and TS must
42 * be set (so that the clts/stts pair does nothing that is 35 * be set (so that the clts/stts pair does nothing that is
@@ -44,6 +37,9 @@
44 */ 37 */
45static inline bool interrupted_kernel_fpu_idle(void) 38static inline bool interrupted_kernel_fpu_idle(void)
46{ 39{
40 if (use_eager_fpu())
41 return 0;
42
47 return !__thread_has_fpu(current) && 43 return !__thread_has_fpu(current) &&
48 (read_cr0() & X86_CR0_TS); 44 (read_cr0() & X86_CR0_TS);
49} 45}
@@ -77,29 +73,29 @@ bool irq_fpu_usable(void)
77} 73}
78EXPORT_SYMBOL(irq_fpu_usable); 74EXPORT_SYMBOL(irq_fpu_usable);
79 75
80void kernel_fpu_begin(void) 76void __kernel_fpu_begin(void)
81{ 77{
82 struct task_struct *me = current; 78 struct task_struct *me = current;
83 79
84 WARN_ON_ONCE(!irq_fpu_usable());
85 preempt_disable();
86 if (__thread_has_fpu(me)) { 80 if (__thread_has_fpu(me)) {
87 __save_init_fpu(me); 81 __save_init_fpu(me);
88 __thread_clear_has_fpu(me); 82 __thread_clear_has_fpu(me);
89 /* We do 'stts()' in kernel_fpu_end() */ 83 /* We do 'stts()' in __kernel_fpu_end() */
90 } else { 84 } else if (!use_eager_fpu()) {
91 this_cpu_write(fpu_owner_task, NULL); 85 this_cpu_write(fpu_owner_task, NULL);
92 clts(); 86 clts();
93 } 87 }
94} 88}
95EXPORT_SYMBOL(kernel_fpu_begin); 89EXPORT_SYMBOL(__kernel_fpu_begin);
96 90
97void kernel_fpu_end(void) 91void __kernel_fpu_end(void)
98{ 92{
99 stts(); 93 if (use_eager_fpu())
100 preempt_enable(); 94 math_state_restore();
95 else
96 stts();
101} 97}
102EXPORT_SYMBOL(kernel_fpu_end); 98EXPORT_SYMBOL(__kernel_fpu_end);
103 99
104void unlazy_fpu(struct task_struct *tsk) 100void unlazy_fpu(struct task_struct *tsk)
105{ 101{
@@ -113,23 +109,15 @@ void unlazy_fpu(struct task_struct *tsk)
113} 109}
114EXPORT_SYMBOL(unlazy_fpu); 110EXPORT_SYMBOL(unlazy_fpu);
115 111
116#ifdef CONFIG_MATH_EMULATION 112unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
117# define HAVE_HWFP (boot_cpu_data.hard_math)
118#else
119# define HAVE_HWFP 1
120#endif
121
122static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
123unsigned int xstate_size; 113unsigned int xstate_size;
124EXPORT_SYMBOL_GPL(xstate_size); 114EXPORT_SYMBOL_GPL(xstate_size);
125unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
126static struct i387_fxsave_struct fx_scratch __cpuinitdata; 115static struct i387_fxsave_struct fx_scratch __cpuinitdata;
127 116
128static void __cpuinit mxcsr_feature_mask_init(void) 117static void __cpuinit mxcsr_feature_mask_init(void)
129{ 118{
130 unsigned long mask = 0; 119 unsigned long mask = 0;
131 120
132 clts();
133 if (cpu_has_fxsr) { 121 if (cpu_has_fxsr) {
134 memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); 122 memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
135 asm volatile("fxsave %0" : : "m" (fx_scratch)); 123 asm volatile("fxsave %0" : : "m" (fx_scratch));
@@ -138,7 +126,6 @@ static void __cpuinit mxcsr_feature_mask_init(void)
138 mask = 0x0000ffbf; 126 mask = 0x0000ffbf;
139 } 127 }
140 mxcsr_feature_mask &= mask; 128 mxcsr_feature_mask &= mask;
141 stts();
142} 129}
143 130
144static void __cpuinit init_thread_xstate(void) 131static void __cpuinit init_thread_xstate(void)
@@ -192,9 +179,8 @@ void __cpuinit fpu_init(void)
192 init_thread_xstate(); 179 init_thread_xstate();
193 180
194 mxcsr_feature_mask_init(); 181 mxcsr_feature_mask_init();
195 /* clean state in init */ 182 xsave_init();
196 current_thread_info()->status = 0; 183 eager_fpu_init();
197 clear_used_math();
198} 184}
199 185
200void fpu_finit(struct fpu *fpu) 186void fpu_finit(struct fpu *fpu)
@@ -205,12 +191,7 @@ void fpu_finit(struct fpu *fpu)
205 } 191 }
206 192
207 if (cpu_has_fxsr) { 193 if (cpu_has_fxsr) {
208 struct i387_fxsave_struct *fx = &fpu->state->fxsave; 194 fx_finit(&fpu->state->fxsave);
209
210 memset(fx, 0, xstate_size);
211 fx->cwd = 0x37f;
212 if (cpu_has_xmm)
213 fx->mxcsr = MXCSR_DEFAULT;
214 } else { 195 } else {
215 struct i387_fsave_struct *fp = &fpu->state->fsave; 196 struct i387_fsave_struct *fp = &fpu->state->fsave;
216 memset(fp, 0, xstate_size); 197 memset(fp, 0, xstate_size);
@@ -454,7 +435,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
454 * FXSR floating point environment conversions. 435 * FXSR floating point environment conversions.
455 */ 436 */
456 437
457static void 438void
458convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) 439convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
459{ 440{
460 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; 441 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
@@ -491,8 +472,8 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
491 memcpy(&to[i], &from[i], sizeof(to[0])); 472 memcpy(&to[i], &from[i], sizeof(to[0]));
492} 473}
493 474
494static void convert_to_fxsr(struct task_struct *tsk, 475void convert_to_fxsr(struct task_struct *tsk,
495 const struct user_i387_ia32_struct *env) 476 const struct user_i387_ia32_struct *env)
496 477
497{ 478{
498 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; 479 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
@@ -589,223 +570,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
589} 570}
590 571
591/* 572/*
592 * Signal frame handlers.
593 */
594
595static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
596{
597 struct task_struct *tsk = current;
598 struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
599
600 fp->status = fp->swd;
601 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
602 return -1;
603 return 1;
604}
605
606static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
607{
608 struct task_struct *tsk = current;
609 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
610 struct user_i387_ia32_struct env;
611 int err = 0;
612
613 convert_from_fxsr(&env, tsk);
614 if (__copy_to_user(buf, &env, sizeof(env)))
615 return -1;
616
617 err |= __put_user(fx->swd, &buf->status);
618 err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
619 if (err)
620 return -1;
621
622 if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
623 return -1;
624 return 1;
625}
626
627static int save_i387_xsave(void __user *buf)
628{
629 struct task_struct *tsk = current;
630 struct _fpstate_ia32 __user *fx = buf;
631 int err = 0;
632
633
634 sanitize_i387_state(tsk);
635
636 /*
637 * For legacy compatible, we always set FP/SSE bits in the bit
638 * vector while saving the state to the user context.
639 * This will enable us capturing any changes(during sigreturn) to
640 * the FP/SSE bits by the legacy applications which don't touch
641 * xstate_bv in the xsave header.
642 *
643 * xsave aware applications can change the xstate_bv in the xsave
644 * header as well as change any contents in the memory layout.
645 * xrestore as part of sigreturn will capture all the changes.
646 */
647 tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
648
649 if (save_i387_fxsave(fx) < 0)
650 return -1;
651
652 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
653 sizeof(struct _fpx_sw_bytes));
654 err |= __put_user(FP_XSTATE_MAGIC2,
655 (__u32 __user *) (buf + sig_xstate_ia32_size
656 - FP_XSTATE_MAGIC2_SIZE));
657 if (err)
658 return -1;
659
660 return 1;
661}
662
663int save_i387_xstate_ia32(void __user *buf)
664{
665 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
666 struct task_struct *tsk = current;
667
668 if (!used_math())
669 return 0;
670
671 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
672 return -EACCES;
673 /*
674 * This will cause a "finit" to be triggered by the next
675 * attempted FPU operation by the 'current' process.
676 */
677 clear_used_math();
678
679 if (!HAVE_HWFP) {
680 return fpregs_soft_get(current, NULL,
681 0, sizeof(struct user_i387_ia32_struct),
682 NULL, fp) ? -1 : 1;
683 }
684
685 unlazy_fpu(tsk);
686
687 if (cpu_has_xsave)
688 return save_i387_xsave(fp);
689 if (cpu_has_fxsr)
690 return save_i387_fxsave(fp);
691 else
692 return save_i387_fsave(fp);
693}
694
695static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
696{
697 struct task_struct *tsk = current;
698
699 return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
700 sizeof(struct i387_fsave_struct));
701}
702
703static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
704 unsigned int size)
705{
706 struct task_struct *tsk = current;
707 struct user_i387_ia32_struct env;
708 int err;
709
710 err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
711 size);
712 /* mxcsr reserved bits must be masked to zero for security reasons */
713 tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
714 if (err || __copy_from_user(&env, buf, sizeof(env)))
715 return 1;
716 convert_to_fxsr(tsk, &env);
717
718 return 0;
719}
720
721static int restore_i387_xsave(void __user *buf)
722{
723 struct _fpx_sw_bytes fx_sw_user;
724 struct _fpstate_ia32 __user *fx_user =
725 ((struct _fpstate_ia32 __user *) buf);
726 struct i387_fxsave_struct __user *fx =
727 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
728 struct xsave_hdr_struct *xsave_hdr =
729 &current->thread.fpu.state->xsave.xsave_hdr;
730 u64 mask;
731 int err;
732
733 if (check_for_xstate(fx, buf, &fx_sw_user))
734 goto fx_only;
735
736 mask = fx_sw_user.xstate_bv;
737
738 err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
739
740 xsave_hdr->xstate_bv &= pcntxt_mask;
741 /*
742 * These bits must be zero.
743 */
744 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
745
746 /*
747 * Init the state that is not present in the memory layout
748 * and enabled by the OS.
749 */
750 mask = ~(pcntxt_mask & ~mask);
751 xsave_hdr->xstate_bv &= mask;
752
753 return err;
754fx_only:
755 /*
756 * Couldn't find the extended state information in the memory
757 * layout. Restore the FP/SSE and init the other extended state
758 * enabled by the OS.
759 */
760 xsave_hdr->xstate_bv = XSTATE_FPSSE;
761 return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
762}
763
764int restore_i387_xstate_ia32(void __user *buf)
765{
766 int err;
767 struct task_struct *tsk = current;
768 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
769
770 if (HAVE_HWFP)
771 clear_fpu(tsk);
772
773 if (!buf) {
774 if (used_math()) {
775 clear_fpu(tsk);
776 clear_used_math();
777 }
778
779 return 0;
780 } else
781 if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
782 return -EACCES;
783
784 if (!used_math()) {
785 err = init_fpu(tsk);
786 if (err)
787 return err;
788 }
789
790 if (HAVE_HWFP) {
791 if (cpu_has_xsave)
792 err = restore_i387_xsave(buf);
793 else if (cpu_has_fxsr)
794 err = restore_i387_fxsave(fp, sizeof(struct
795 i387_fxsave_struct));
796 else
797 err = restore_i387_fsave(fp);
798 } else {
799 err = fpregs_soft_set(current, NULL,
800 0, sizeof(struct user_i387_ia32_struct),
801 NULL, fp) != 0;
802 }
803 set_used_math();
804
805 return err;
806}
807
808/*
809 * FPU state for core dumps. 573 * FPU state for core dumps.
810 * This is only used for a.out dumps now. 574 * This is only used for a.out dumps now.
811 * It is declared generically using elf_fpregset_t (which is 575 * It is declared generically using elf_fpregset_t (which is
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 36d1853e91af..9a5c460404dc 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -263,7 +263,7 @@ static void i8259A_shutdown(void)
263 * out of. 263 * out of.
264 */ 264 */
265 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 265 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
266 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ 266 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
267} 267}
268 268
269static struct syscore_ops i8259_syscore_ops = { 269static struct syscore_ops i8259_syscore_ops = {
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d44f7829968e..e4595f105910 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -92,7 +92,8 @@ int arch_show_interrupts(struct seq_file *p, int prec)
92 seq_printf(p, " Rescheduling interrupts\n"); 92 seq_printf(p, " Rescheduling interrupts\n");
93 seq_printf(p, "%*s: ", prec, "CAL"); 93 seq_printf(p, "%*s: ", prec, "CAL");
94 for_each_online_cpu(j) 94 for_each_online_cpu(j)
95 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); 95 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
96 irq_stats(j)->irq_tlb_count);
96 seq_printf(p, " Function call interrupts\n"); 97 seq_printf(p, " Function call interrupts\n");
97 seq_printf(p, "%*s: ", prec, "TLB"); 98 seq_printf(p, "%*s: ", prec, "TLB");
98 for_each_online_cpu(j) 99 for_each_online_cpu(j)
@@ -147,7 +148,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
147#ifdef CONFIG_SMP 148#ifdef CONFIG_SMP
148 sum += irq_stats(cpu)->irq_resched_count; 149 sum += irq_stats(cpu)->irq_resched_count;
149 sum += irq_stats(cpu)->irq_call_count; 150 sum += irq_stats(cpu)->irq_call_count;
150 sum += irq_stats(cpu)->irq_tlb_count;
151#endif 151#endif
152#ifdef CONFIG_X86_THERMAL_VECTOR 152#ifdef CONFIG_X86_THERMAL_VECTOR
153 sum += irq_stats(cpu)->irq_thermal_count; 153 sum += irq_stats(cpu)->irq_thermal_count;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 3f61904365cf..836f8322960e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -746,7 +746,9 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
746int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) 746int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
747{ 747{
748 int err; 748 int err;
749#ifdef CONFIG_DEBUG_RODATA
749 char opc[BREAK_INSTR_SIZE]; 750 char opc[BREAK_INSTR_SIZE];
751#endif /* CONFIG_DEBUG_RODATA */
750 752
751 bpt->type = BP_BREAKPOINT; 753 bpt->type = BP_BREAKPOINT;
752 err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, 754 err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e2f751efb7b1..57916c0d3cf6 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -541,6 +541,23 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb
541 return 1; 541 return 1;
542} 542}
543 543
544#ifdef KPROBES_CAN_USE_FTRACE
545static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
546 struct kprobe_ctlblk *kcb)
547{
548 /*
549 * Emulate singlestep (and also recover regs->ip)
550 * as if there is a 5byte nop
551 */
552 regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
553 if (unlikely(p->post_handler)) {
554 kcb->kprobe_status = KPROBE_HIT_SSDONE;
555 p->post_handler(p, regs, 0);
556 }
557 __this_cpu_write(current_kprobe, NULL);
558}
559#endif
560
544/* 561/*
545 * Interrupts are disabled on entry as trap3 is an interrupt gate and they 562 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
546 * remain disabled throughout this function. 563 * remain disabled throughout this function.
@@ -599,6 +616,12 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
599 } else if (kprobe_running()) { 616 } else if (kprobe_running()) {
600 p = __this_cpu_read(current_kprobe); 617 p = __this_cpu_read(current_kprobe);
601 if (p->break_handler && p->break_handler(p, regs)) { 618 if (p->break_handler && p->break_handler(p, regs)) {
619#ifdef KPROBES_CAN_USE_FTRACE
620 if (kprobe_ftrace(p)) {
621 skip_singlestep(p, regs, kcb);
622 return 1;
623 }
624#endif
602 setup_singlestep(p, regs, kcb, 0); 625 setup_singlestep(p, regs, kcb, 0);
603 return 1; 626 return 1;
604 } 627 }
@@ -1052,6 +1075,50 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1052 return 0; 1075 return 0;
1053} 1076}
1054 1077
1078#ifdef KPROBES_CAN_USE_FTRACE
1079/* Ftrace callback handler for kprobes */
1080void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
1081 struct ftrace_ops *ops, struct pt_regs *regs)
1082{
1083 struct kprobe *p;
1084 struct kprobe_ctlblk *kcb;
1085 unsigned long flags;
1086
1087 /* Disable irq for emulating a breakpoint and avoiding preempt */
1088 local_irq_save(flags);
1089
1090 p = get_kprobe((kprobe_opcode_t *)ip);
1091 if (unlikely(!p) || kprobe_disabled(p))
1092 goto end;
1093
1094 kcb = get_kprobe_ctlblk();
1095 if (kprobe_running()) {
1096 kprobes_inc_nmissed_count(p);
1097 } else {
1098 /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
1099 regs->ip = ip + sizeof(kprobe_opcode_t);
1100
1101 __this_cpu_write(current_kprobe, p);
1102 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1103 if (!p->pre_handler || !p->pre_handler(p, regs))
1104 skip_singlestep(p, regs, kcb);
1105 /*
1106 * If pre_handler returns !0, it sets regs->ip and
1107 * resets current kprobe.
1108 */
1109 }
1110end:
1111 local_irq_restore(flags);
1112}
1113
1114int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
1115{
1116 p->ainsn.insn = NULL;
1117 p->ainsn.boostable = -1;
1118 return 0;
1119}
1120#endif
1121
1055int __init arch_init_kprobes(void) 1122int __init arch_init_kprobes(void)
1056{ 1123{
1057 return arch_init_optprobes(); 1124 return arch_init_optprobes();
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c1d61ee4b4f1..b3e5e51bc907 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -354,6 +354,7 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
354 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 354 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
355 wrmsrl(MSR_KVM_PV_EOI_EN, 0); 355 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
356 kvm_pv_disable_apf(); 356 kvm_pv_disable_apf();
357 kvm_disable_steal_time();
357} 358}
358 359
359static int kvm_pv_reboot_notify(struct notifier_block *nb, 360static int kvm_pv_reboot_notify(struct notifier_block *nb,
@@ -396,9 +397,7 @@ void kvm_disable_steal_time(void)
396#ifdef CONFIG_SMP 397#ifdef CONFIG_SMP
397static void __init kvm_smp_prepare_boot_cpu(void) 398static void __init kvm_smp_prepare_boot_cpu(void)
398{ 399{
399#ifdef CONFIG_KVM_CLOCK
400 WARN_ON(kvm_register_clock("primary cpu clock")); 400 WARN_ON(kvm_register_clock("primary cpu clock"));
401#endif
402 kvm_guest_cpu_init(); 401 kvm_guest_cpu_init();
403 native_smp_prepare_boot_cpu(); 402 native_smp_prepare_boot_cpu();
404} 403}
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 82746f942cd8..7720ff5a9ee2 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -75,20 +75,113 @@ struct microcode_amd {
75 75
76static struct equiv_cpu_entry *equiv_cpu_table; 76static struct equiv_cpu_entry *equiv_cpu_table;
77 77
78/* page-sized ucode patch buffer */ 78struct ucode_patch {
79void *patch; 79 struct list_head plist;
80 void *data;
81 u32 patch_id;
82 u16 equiv_cpu;
83};
84
85static LIST_HEAD(pcache);
86
87static u16 find_equiv_id(unsigned int cpu)
88{
89 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
90 int i = 0;
91
92 if (!equiv_cpu_table)
93 return 0;
94
95 while (equiv_cpu_table[i].installed_cpu != 0) {
96 if (uci->cpu_sig.sig == equiv_cpu_table[i].installed_cpu)
97 return equiv_cpu_table[i].equiv_cpu;
98
99 i++;
100 }
101 return 0;
102}
103
104static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu)
105{
106 int i = 0;
107
108 BUG_ON(!equiv_cpu_table);
109
110 while (equiv_cpu_table[i].equiv_cpu != 0) {
111 if (equiv_cpu == equiv_cpu_table[i].equiv_cpu)
112 return equiv_cpu_table[i].installed_cpu;
113 i++;
114 }
115 return 0;
116}
117
118/*
119 * a small, trivial cache of per-family ucode patches
120 */
121static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
122{
123 struct ucode_patch *p;
124
125 list_for_each_entry(p, &pcache, plist)
126 if (p->equiv_cpu == equiv_cpu)
127 return p;
128 return NULL;
129}
130
131static void update_cache(struct ucode_patch *new_patch)
132{
133 struct ucode_patch *p;
134
135 list_for_each_entry(p, &pcache, plist) {
136 if (p->equiv_cpu == new_patch->equiv_cpu) {
137 if (p->patch_id >= new_patch->patch_id)
138 /* we already have the latest patch */
139 return;
140
141 list_replace(&p->plist, &new_patch->plist);
142 kfree(p->data);
143 kfree(p);
144 return;
145 }
146 }
147 /* no patch found, add it */
148 list_add_tail(&new_patch->plist, &pcache);
149}
150
151static void free_cache(void)
152{
153 struct ucode_patch *p, *tmp;
154
155 list_for_each_entry_safe(p, tmp, &pcache, plist) {
156 __list_del(p->plist.prev, p->plist.next);
157 kfree(p->data);
158 kfree(p);
159 }
160}
161
162static struct ucode_patch *find_patch(unsigned int cpu)
163{
164 u16 equiv_id;
165
166 equiv_id = find_equiv_id(cpu);
167 if (!equiv_id)
168 return NULL;
169
170 return cache_find_patch(equiv_id);
171}
80 172
81static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 173static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
82{ 174{
83 struct cpuinfo_x86 *c = &cpu_data(cpu); 175 struct cpuinfo_x86 *c = &cpu_data(cpu);
84 176
177 csig->sig = cpuid_eax(0x00000001);
85 csig->rev = c->microcode; 178 csig->rev = c->microcode;
86 pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); 179 pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
87 180
88 return 0; 181 return 0;
89} 182}
90 183
91static unsigned int verify_ucode_size(int cpu, u32 patch_size, 184static unsigned int verify_patch_size(int cpu, u32 patch_size,
92 unsigned int size) 185 unsigned int size)
93{ 186{
94 struct cpuinfo_x86 *c = &cpu_data(cpu); 187 struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -118,95 +211,37 @@ static unsigned int verify_ucode_size(int cpu, u32 patch_size,
118 return patch_size; 211 return patch_size;
119} 212}
120 213
121static u16 find_equiv_id(void) 214static int apply_microcode_amd(int cpu)
122{ 215{
123 unsigned int current_cpu_id, i = 0; 216 struct cpuinfo_x86 *c = &cpu_data(cpu);
124 217 struct microcode_amd *mc_amd;
125 BUG_ON(equiv_cpu_table == NULL); 218 struct ucode_cpu_info *uci;
126 219 struct ucode_patch *p;
127 current_cpu_id = cpuid_eax(0x00000001); 220 u32 rev, dummy;
128
129 while (equiv_cpu_table[i].installed_cpu != 0) {
130 if (current_cpu_id == equiv_cpu_table[i].installed_cpu)
131 return equiv_cpu_table[i].equiv_cpu;
132
133 i++;
134 }
135 return 0;
136}
137 221
138/* 222 BUG_ON(raw_smp_processor_id() != cpu);
139 * we signal a good patch is found by returning its size > 0
140 */
141static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
142 unsigned int leftover_size, int rev,
143 unsigned int *current_size)
144{
145 struct microcode_header_amd *mc_hdr;
146 unsigned int actual_size, patch_size;
147 u16 equiv_cpu_id;
148 223
149 /* size of the current patch we're staring at */ 224 uci = ucode_cpu_info + cpu;
150 patch_size = *(u32 *)(ucode_ptr + 4);
151 *current_size = patch_size + SECTION_HDR_SIZE;
152 225
153 equiv_cpu_id = find_equiv_id(); 226 p = find_patch(cpu);
154 if (!equiv_cpu_id) 227 if (!p)
155 return 0; 228 return 0;
156 229
157 /* 230 mc_amd = p->data;
158 * let's look at the patch header itself now 231 uci->mc = p->data;
159 */
160 mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE);
161 232
162 if (mc_hdr->processor_rev_id != equiv_cpu_id) 233 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
163 return 0;
164 234
165 /* ucode might be chipset specific -- currently we don't support this */ 235 /* need to apply patch? */
166 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { 236 if (rev >= mc_amd->hdr.patch_id) {
167 pr_err("CPU%d: chipset specific code not yet supported\n", 237 c->microcode = rev;
168 cpu);
169 return 0; 238 return 0;
170 } 239 }
171 240
172 if (mc_hdr->patch_id <= rev)
173 return 0;
174
175 /*
176 * now that the header looks sane, verify its size
177 */
178 actual_size = verify_ucode_size(cpu, patch_size, leftover_size);
179 if (!actual_size)
180 return 0;
181
182 /* clear the patch buffer */
183 memset(patch, 0, PAGE_SIZE);
184
185 /* all looks ok, get the binary patch */
186 get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size);
187
188 return actual_size;
189}
190
191static int apply_microcode_amd(int cpu)
192{
193 u32 rev, dummy;
194 int cpu_num = raw_smp_processor_id();
195 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
196 struct microcode_amd *mc_amd = uci->mc;
197 struct cpuinfo_x86 *c = &cpu_data(cpu);
198
199 /* We should bind the task to the CPU */
200 BUG_ON(cpu_num != cpu);
201
202 if (mc_amd == NULL)
203 return 0;
204
205 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); 241 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
206 /* get patch id after patching */
207 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
208 242
209 /* check current patch id and patch's id for match */ 243 /* verify patch application was successful */
244 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
210 if (rev != mc_amd->hdr.patch_id) { 245 if (rev != mc_amd->hdr.patch_id) {
211 pr_err("CPU%d: update failed for patch_level=0x%08x\n", 246 pr_err("CPU%d: update failed for patch_level=0x%08x\n",
212 cpu, mc_amd->hdr.patch_id); 247 cpu, mc_amd->hdr.patch_id);
@@ -238,7 +273,7 @@ static int install_equiv_cpu_table(const u8 *buf)
238 return -ENOMEM; 273 return -ENOMEM;
239 } 274 }
240 275
241 get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); 276 memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
242 277
243 /* add header length */ 278 /* add header length */
244 return size + CONTAINER_HDR_SZ; 279 return size + CONTAINER_HDR_SZ;
@@ -250,61 +285,113 @@ static void free_equiv_cpu_table(void)
250 equiv_cpu_table = NULL; 285 equiv_cpu_table = NULL;
251} 286}
252 287
253static enum ucode_state 288static void cleanup(void)
254generic_load_microcode(int cpu, const u8 *data, size_t size)
255{ 289{
256 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 290 free_equiv_cpu_table();
257 struct microcode_header_amd *mc_hdr = NULL; 291 free_cache();
258 unsigned int mc_size, leftover, current_size = 0; 292}
293
294/*
295 * We return the current size even if some of the checks failed so that
296 * we can skip over the next patch. If we return a negative value, we
297 * signal a grave error like a memory allocation has failed and the
298 * driver cannot continue functioning normally. In such cases, we tear
299 * down everything we've used up so far and exit.
300 */
301static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
302{
303 struct cpuinfo_x86 *c = &cpu_data(cpu);
304 struct microcode_header_amd *mc_hdr;
305 struct ucode_patch *patch;
306 unsigned int patch_size, crnt_size, ret;
307 u32 proc_fam;
308 u16 proc_id;
309
310 patch_size = *(u32 *)(fw + 4);
311 crnt_size = patch_size + SECTION_HDR_SIZE;
312 mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
313 proc_id = mc_hdr->processor_rev_id;
314
315 proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
316 if (!proc_fam) {
317 pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
318 return crnt_size;
319 }
320
321 /* check if patch is for the current family */
322 proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
323 if (proc_fam != c->x86)
324 return crnt_size;
325
326 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
327 pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
328 mc_hdr->patch_id);
329 return crnt_size;
330 }
331
332 ret = verify_patch_size(cpu, patch_size, leftover);
333 if (!ret) {
334 pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
335 return crnt_size;
336 }
337
338 patch = kzalloc(sizeof(*patch), GFP_KERNEL);
339 if (!patch) {
340 pr_err("Patch allocation failure.\n");
341 return -EINVAL;
342 }
343
344 patch->data = kzalloc(patch_size, GFP_KERNEL);
345 if (!patch->data) {
346 pr_err("Patch data allocation failure.\n");
347 kfree(patch);
348 return -EINVAL;
349 }
350
351 /* All looks ok, copy patch... */
352 memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
353 INIT_LIST_HEAD(&patch->plist);
354 patch->patch_id = mc_hdr->patch_id;
355 patch->equiv_cpu = proc_id;
356
357 /* ... and add to cache. */
358 update_cache(patch);
359
360 return crnt_size;
361}
362
363static enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size)
364{
365 enum ucode_state ret = UCODE_ERROR;
366 unsigned int leftover;
367 u8 *fw = (u8 *)data;
368 int crnt_size = 0;
259 int offset; 369 int offset;
260 const u8 *ucode_ptr = data;
261 void *new_mc = NULL;
262 unsigned int new_rev = uci->cpu_sig.rev;
263 enum ucode_state state = UCODE_ERROR;
264 370
265 offset = install_equiv_cpu_table(ucode_ptr); 371 offset = install_equiv_cpu_table(data);
266 if (offset < 0) { 372 if (offset < 0) {
267 pr_err("failed to create equivalent cpu table\n"); 373 pr_err("failed to create equivalent cpu table\n");
268 goto out; 374 return ret;
269 } 375 }
270 ucode_ptr += offset; 376 fw += offset;
271 leftover = size - offset; 377 leftover = size - offset;
272 378
273 if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) { 379 if (*(u32 *)fw != UCODE_UCODE_TYPE) {
274 pr_err("invalid type field in container file section header\n"); 380 pr_err("invalid type field in container file section header\n");
275 goto free_table; 381 free_equiv_cpu_table();
382 return ret;
276 } 383 }
277 384
278 while (leftover) { 385 while (leftover) {
279 mc_size = get_matching_microcode(cpu, ucode_ptr, leftover, 386 crnt_size = verify_and_add_patch(cpu, fw, leftover);
280 new_rev, &current_size); 387 if (crnt_size < 0)
281 if (mc_size) { 388 return ret;
282 mc_hdr = patch;
283 new_mc = patch;
284 new_rev = mc_hdr->patch_id;
285 goto out_ok;
286 }
287
288 ucode_ptr += current_size;
289 leftover -= current_size;
290 }
291 389
292 if (!new_mc) { 390 fw += crnt_size;
293 state = UCODE_NFOUND; 391 leftover -= crnt_size;
294 goto free_table;
295 } 392 }
296 393
297out_ok: 394 return UCODE_OK;
298 uci->mc = new_mc;
299 state = UCODE_OK;
300 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
301 cpu, uci->cpu_sig.rev, new_rev);
302
303free_table:
304 free_equiv_cpu_table();
305
306out:
307 return state;
308} 395}
309 396
310/* 397/*
@@ -315,7 +402,7 @@ out:
315 * 402 *
316 * This legacy file is always smaller than 2K in size. 403 * This legacy file is always smaller than 2K in size.
317 * 404 *
318 * Starting at family 15h they are in family specific firmware files: 405 * Beginning with family 15h, they are in family-specific firmware files:
319 * 406 *
320 * amd-ucode/microcode_amd_fam15h.bin 407 * amd-ucode/microcode_amd_fam15h.bin
321 * amd-ucode/microcode_amd_fam16h.bin 408 * amd-ucode/microcode_amd_fam16h.bin
@@ -323,12 +410,17 @@ out:
323 * 410 *
324 * These might be larger than 2K. 411 * These might be larger than 2K.
325 */ 412 */
326static enum ucode_state request_microcode_amd(int cpu, struct device *device) 413static enum ucode_state request_microcode_amd(int cpu, struct device *device,
414 bool refresh_fw)
327{ 415{
328 char fw_name[36] = "amd-ucode/microcode_amd.bin"; 416 char fw_name[36] = "amd-ucode/microcode_amd.bin";
329 const struct firmware *fw;
330 enum ucode_state ret = UCODE_NFOUND;
331 struct cpuinfo_x86 *c = &cpu_data(cpu); 417 struct cpuinfo_x86 *c = &cpu_data(cpu);
418 enum ucode_state ret = UCODE_NFOUND;
419 const struct firmware *fw;
420
421 /* reload ucode container only on the boot cpu */
422 if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
423 return UCODE_OK;
332 424
333 if (c->x86 >= 0x15) 425 if (c->x86 >= 0x15)
334 snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); 426 snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
@@ -344,12 +436,17 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
344 goto fw_release; 436 goto fw_release;
345 } 437 }
346 438
347 ret = generic_load_microcode(cpu, fw->data, fw->size); 439 /* free old equiv table */
440 free_equiv_cpu_table();
441
442 ret = load_microcode_amd(cpu, fw->data, fw->size);
443 if (ret != UCODE_OK)
444 cleanup();
348 445
349fw_release: 446 fw_release:
350 release_firmware(fw); 447 release_firmware(fw);
351 448
352out: 449 out:
353 return ret; 450 return ret;
354} 451}
355 452
@@ -383,14 +480,10 @@ struct microcode_ops * __init init_amd_microcode(void)
383 return NULL; 480 return NULL;
384 } 481 }
385 482
386 patch = (void *)get_zeroed_page(GFP_KERNEL);
387 if (!patch)
388 return NULL;
389
390 return &microcode_amd_ops; 483 return &microcode_amd_ops;
391} 484}
392 485
393void __exit exit_amd_microcode(void) 486void __exit exit_amd_microcode(void)
394{ 487{
395 free_page((unsigned long)patch); 488 cleanup();
396} 489}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 4873e62db6a1..3a04b224d0c0 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -225,6 +225,9 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
225 if (do_microcode_update(buf, len) == 0) 225 if (do_microcode_update(buf, len) == 0)
226 ret = (ssize_t)len; 226 ret = (ssize_t)len;
227 227
228 if (ret > 0)
229 perf_check_microcode();
230
228 mutex_unlock(&microcode_mutex); 231 mutex_unlock(&microcode_mutex);
229 put_online_cpus(); 232 put_online_cpus();
230 233
@@ -276,19 +279,18 @@ static struct platform_device *microcode_pdev;
276static int reload_for_cpu(int cpu) 279static int reload_for_cpu(int cpu)
277{ 280{
278 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 281 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
282 enum ucode_state ustate;
279 int err = 0; 283 int err = 0;
280 284
281 if (uci->valid) { 285 if (!uci->valid)
282 enum ucode_state ustate; 286 return err;
283
284 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
285 if (ustate == UCODE_OK)
286 apply_microcode_on_target(cpu);
287 else
288 if (ustate == UCODE_ERROR)
289 err = -EINVAL;
290 }
291 287
288 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true);
289 if (ustate == UCODE_OK)
290 apply_microcode_on_target(cpu);
291 else
292 if (ustate == UCODE_ERROR)
293 err = -EINVAL;
292 return err; 294 return err;
293} 295}
294 296
@@ -370,18 +372,15 @@ static void microcode_fini_cpu(int cpu)
370 372
371static enum ucode_state microcode_resume_cpu(int cpu) 373static enum ucode_state microcode_resume_cpu(int cpu)
372{ 374{
373 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
374
375 if (!uci->mc)
376 return UCODE_NFOUND;
377
378 pr_debug("CPU%d updated upon resume\n", cpu); 375 pr_debug("CPU%d updated upon resume\n", cpu);
379 apply_microcode_on_target(cpu); 376
377 if (apply_microcode_on_target(cpu))
378 return UCODE_ERROR;
380 379
381 return UCODE_OK; 380 return UCODE_OK;
382} 381}
383 382
384static enum ucode_state microcode_init_cpu(int cpu) 383static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
385{ 384{
386 enum ucode_state ustate; 385 enum ucode_state ustate;
387 386
@@ -392,7 +391,8 @@ static enum ucode_state microcode_init_cpu(int cpu)
392 if (system_state != SYSTEM_RUNNING) 391 if (system_state != SYSTEM_RUNNING)
393 return UCODE_NFOUND; 392 return UCODE_NFOUND;
394 393
395 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); 394 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev,
395 refresh_fw);
396 396
397 if (ustate == UCODE_OK) { 397 if (ustate == UCODE_OK) {
398 pr_debug("CPU%d updated upon init\n", cpu); 398 pr_debug("CPU%d updated upon init\n", cpu);
@@ -405,14 +405,11 @@ static enum ucode_state microcode_init_cpu(int cpu)
405static enum ucode_state microcode_update_cpu(int cpu) 405static enum ucode_state microcode_update_cpu(int cpu)
406{ 406{
407 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 407 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
408 enum ucode_state ustate;
409 408
410 if (uci->valid) 409 if (uci->valid)
411 ustate = microcode_resume_cpu(cpu); 410 return microcode_resume_cpu(cpu);
412 else
413 ustate = microcode_init_cpu(cpu);
414 411
415 return ustate; 412 return microcode_init_cpu(cpu, false);
416} 413}
417 414
418static int mc_device_add(struct device *dev, struct subsys_interface *sif) 415static int mc_device_add(struct device *dev, struct subsys_interface *sif)
@@ -428,7 +425,7 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
428 if (err) 425 if (err)
429 return err; 426 return err;
430 427
431 if (microcode_init_cpu(cpu) == UCODE_ERROR) 428 if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
432 return -EINVAL; 429 return -EINVAL;
433 430
434 return err; 431 return err;
@@ -477,34 +474,41 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
477 struct device *dev; 474 struct device *dev;
478 475
479 dev = get_cpu_device(cpu); 476 dev = get_cpu_device(cpu);
480 switch (action) { 477
478 switch (action & ~CPU_TASKS_FROZEN) {
481 case CPU_ONLINE: 479 case CPU_ONLINE:
482 case CPU_ONLINE_FROZEN:
483 microcode_update_cpu(cpu); 480 microcode_update_cpu(cpu);
484 case CPU_DOWN_FAILED:
485 case CPU_DOWN_FAILED_FROZEN:
486 pr_debug("CPU%d added\n", cpu); 481 pr_debug("CPU%d added\n", cpu);
482 /*
483 * "break" is missing on purpose here because we want to fall
484 * through in order to create the sysfs group.
485 */
486
487 case CPU_DOWN_FAILED:
487 if (sysfs_create_group(&dev->kobj, &mc_attr_group)) 488 if (sysfs_create_group(&dev->kobj, &mc_attr_group))
488 pr_err("Failed to create group for CPU%d\n", cpu); 489 pr_err("Failed to create group for CPU%d\n", cpu);
489 break; 490 break;
491
490 case CPU_DOWN_PREPARE: 492 case CPU_DOWN_PREPARE:
491 case CPU_DOWN_PREPARE_FROZEN:
492 /* Suspend is in progress, only remove the interface */ 493 /* Suspend is in progress, only remove the interface */
493 sysfs_remove_group(&dev->kobj, &mc_attr_group); 494 sysfs_remove_group(&dev->kobj, &mc_attr_group);
494 pr_debug("CPU%d removed\n", cpu); 495 pr_debug("CPU%d removed\n", cpu);
495 break; 496 break;
496 497
497 /* 498 /*
499 * case CPU_DEAD:
500 *
498 * When a CPU goes offline, don't free up or invalidate the copy of 501 * When a CPU goes offline, don't free up or invalidate the copy of
499 * the microcode in kernel memory, so that we can reuse it when the 502 * the microcode in kernel memory, so that we can reuse it when the
500 * CPU comes back online without unnecessarily requesting the userspace 503 * CPU comes back online without unnecessarily requesting the userspace
501 * for it again. 504 * for it again.
502 */ 505 */
503 case CPU_UP_CANCELED_FROZEN:
504 /* The CPU refused to come up during a system resume */
505 microcode_fini_cpu(cpu);
506 break;
507 } 506 }
507
508 /* The CPU refused to come up during a system resume */
509 if (action == CPU_UP_CANCELED_FROZEN)
510 microcode_fini_cpu(cpu);
511
508 return NOTIFY_OK; 512 return NOTIFY_OK;
509} 513}
510 514
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 0327e2b3c408..3544aed39338 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -405,7 +405,8 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
405 return 0; 405 return 0;
406} 406}
407 407
408static enum ucode_state request_microcode_fw(int cpu, struct device *device) 408static enum ucode_state request_microcode_fw(int cpu, struct device *device,
409 bool refresh_fw)
409{ 410{
410 char name[30]; 411 char name[30];
411 struct cpuinfo_x86 *c = &cpu_data(cpu); 412 struct cpuinfo_x86 *c = &cpu_data(cpu);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index eb113693f043..a7c5661f8496 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -257,12 +257,14 @@ static int __init msr_init(void)
257 goto out_chrdev; 257 goto out_chrdev;
258 } 258 }
259 msr_class->devnode = msr_devnode; 259 msr_class->devnode = msr_devnode;
260 get_online_cpus();
260 for_each_online_cpu(i) { 261 for_each_online_cpu(i) {
261 err = msr_device_create(i); 262 err = msr_device_create(i);
262 if (err != 0) 263 if (err != 0)
263 goto out_class; 264 goto out_class;
264 } 265 }
265 register_hotcpu_notifier(&msr_class_cpu_notifier); 266 register_hotcpu_notifier(&msr_class_cpu_notifier);
267 put_online_cpus();
266 268
267 err = 0; 269 err = 0;
268 goto out; 270 goto out;
@@ -271,6 +273,7 @@ out_class:
271 i = 0; 273 i = 0;
272 for_each_online_cpu(i) 274 for_each_online_cpu(i)
273 msr_device_destroy(i); 275 msr_device_destroy(i);
276 put_online_cpus();
274 class_destroy(msr_class); 277 class_destroy(msr_class);
275out_chrdev: 278out_chrdev:
276 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); 279 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
@@ -281,11 +284,13 @@ out:
281static void __exit msr_exit(void) 284static void __exit msr_exit(void)
282{ 285{
283 int cpu = 0; 286 int cpu = 0;
287 get_online_cpus();
284 for_each_online_cpu(cpu) 288 for_each_online_cpu(cpu)
285 msr_device_destroy(cpu); 289 msr_device_destroy(cpu);
286 class_destroy(msr_class); 290 class_destroy(msr_class);
287 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); 291 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
288 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 292 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
293 put_online_cpus();
289} 294}
290 295
291module_init(msr_init); 296module_init(msr_init);
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
new file mode 100644
index 000000000000..e309cc5c276e
--- /dev/null
+++ b/arch/x86/kernel/perf_regs.c
@@ -0,0 +1,105 @@
1#include <linux/errno.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/perf_event.h>
5#include <linux/bug.h>
6#include <linux/stddef.h>
7#include <asm/perf_regs.h>
8#include <asm/ptrace.h>
9
10#ifdef CONFIG_X86_32
11#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX
12#else
13#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX
14#endif
15
16#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
17
18static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
19 PT_REGS_OFFSET(PERF_REG_X86_AX, ax),
20 PT_REGS_OFFSET(PERF_REG_X86_BX, bx),
21 PT_REGS_OFFSET(PERF_REG_X86_CX, cx),
22 PT_REGS_OFFSET(PERF_REG_X86_DX, dx),
23 PT_REGS_OFFSET(PERF_REG_X86_SI, si),
24 PT_REGS_OFFSET(PERF_REG_X86_DI, di),
25 PT_REGS_OFFSET(PERF_REG_X86_BP, bp),
26 PT_REGS_OFFSET(PERF_REG_X86_SP, sp),
27 PT_REGS_OFFSET(PERF_REG_X86_IP, ip),
28 PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags),
29 PT_REGS_OFFSET(PERF_REG_X86_CS, cs),
30 PT_REGS_OFFSET(PERF_REG_X86_SS, ss),
31#ifdef CONFIG_X86_32
32 PT_REGS_OFFSET(PERF_REG_X86_DS, ds),
33 PT_REGS_OFFSET(PERF_REG_X86_ES, es),
34 PT_REGS_OFFSET(PERF_REG_X86_FS, fs),
35 PT_REGS_OFFSET(PERF_REG_X86_GS, gs),
36#else
37 /*
38 * The pt_regs struct does not store
39 * ds, es, fs, gs in 64 bit mode.
40 */
41 (unsigned int) -1,
42 (unsigned int) -1,
43 (unsigned int) -1,
44 (unsigned int) -1,
45#endif
46#ifdef CONFIG_X86_64
47 PT_REGS_OFFSET(PERF_REG_X86_R8, r8),
48 PT_REGS_OFFSET(PERF_REG_X86_R9, r9),
49 PT_REGS_OFFSET(PERF_REG_X86_R10, r10),
50 PT_REGS_OFFSET(PERF_REG_X86_R11, r11),
51 PT_REGS_OFFSET(PERF_REG_X86_R12, r12),
52 PT_REGS_OFFSET(PERF_REG_X86_R13, r13),
53 PT_REGS_OFFSET(PERF_REG_X86_R14, r14),
54 PT_REGS_OFFSET(PERF_REG_X86_R15, r15),
55#endif
56};
57
58u64 perf_reg_value(struct pt_regs *regs, int idx)
59{
60 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset)))
61 return 0;
62
63 return regs_get_register(regs, pt_regs_offset[idx]);
64}
65
66#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL))
67
68#ifdef CONFIG_X86_32
69int perf_reg_validate(u64 mask)
70{
71 if (!mask || mask & REG_RESERVED)
72 return -EINVAL;
73
74 return 0;
75}
76
77u64 perf_reg_abi(struct task_struct *task)
78{
79 return PERF_SAMPLE_REGS_ABI_32;
80}
81#else /* CONFIG_X86_64 */
82#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
83 (1ULL << PERF_REG_X86_ES) | \
84 (1ULL << PERF_REG_X86_FS) | \
85 (1ULL << PERF_REG_X86_GS))
86
87int perf_reg_validate(u64 mask)
88{
89 if (!mask || mask & REG_RESERVED)
90 return -EINVAL;
91
92 if (mask & REG_NOSUPPORT)
93 return -EINVAL;
94
95 return 0;
96}
97
98u64 perf_reg_abi(struct task_struct *task)
99{
100 if (test_tsk_thread_flag(task, TIF_IA32))
101 return PERF_SAMPLE_REGS_ABI_32;
102 else
103 return PERF_SAMPLE_REGS_ABI_64;
104}
105#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 0bc72e2069e3..d5f15c3f7b25 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -150,7 +150,7 @@ static struct resource *find_oprom(struct pci_dev *pdev)
150 return oprom; 150 return oprom;
151} 151}
152 152
153void *pci_map_biosrom(struct pci_dev *pdev) 153void __iomem *pci_map_biosrom(struct pci_dev *pdev)
154{ 154{
155 struct resource *oprom = find_oprom(pdev); 155 struct resource *oprom = find_oprom(pdev);
156 156
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ef6a8456f719..b644e1c765dc 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -66,15 +66,13 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
66{ 66{
67 int ret; 67 int ret;
68 68
69 unlazy_fpu(src);
70
71 *dst = *src; 69 *dst = *src;
72 if (fpu_allocated(&src->thread.fpu)) { 70 if (fpu_allocated(&src->thread.fpu)) {
73 memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); 71 memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
74 ret = fpu_alloc(&dst->thread.fpu); 72 ret = fpu_alloc(&dst->thread.fpu);
75 if (ret) 73 if (ret)
76 return ret; 74 return ret;
77 fpu_copy(&dst->thread.fpu, &src->thread.fpu); 75 fpu_copy(dst, src);
78 } 76 }
79 return 0; 77 return 0;
80} 78}
@@ -97,16 +95,6 @@ void arch_task_cache_init(void)
97 SLAB_PANIC | SLAB_NOTRACK, NULL); 95 SLAB_PANIC | SLAB_NOTRACK, NULL);
98} 96}
99 97
100static inline void drop_fpu(struct task_struct *tsk)
101{
102 /*
103 * Forget coprocessor state..
104 */
105 tsk->fpu_counter = 0;
106 clear_fpu(tsk);
107 clear_used_math();
108}
109
110/* 98/*
111 * Free current thread data structures etc.. 99 * Free current thread data structures etc..
112 */ 100 */
@@ -163,7 +151,13 @@ void flush_thread(void)
163 151
164 flush_ptrace_hw_breakpoint(tsk); 152 flush_ptrace_hw_breakpoint(tsk);
165 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 153 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
166 drop_fpu(tsk); 154 drop_init_fpu(tsk);
155 /*
156 * Free the FPU state for non xsave platforms. They get reallocated
157 * lazily at the first use.
158 */
159 if (!use_eager_fpu())
160 free_thread_xstate(tsk);
167} 161}
168 162
169static void hard_disable_TSC(void) 163static void hard_disable_TSC(void)
@@ -299,71 +293,6 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
299} 293}
300 294
301/* 295/*
302 * This gets run with %si containing the
303 * function to call, and %di containing
304 * the "args".
305 */
306extern void kernel_thread_helper(void);
307
308/*
309 * Create a kernel thread
310 */
311int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
312{
313 struct pt_regs regs;
314
315 memset(&regs, 0, sizeof(regs));
316
317 regs.si = (unsigned long) fn;
318 regs.di = (unsigned long) arg;
319
320#ifdef CONFIG_X86_32
321 regs.ds = __USER_DS;
322 regs.es = __USER_DS;
323 regs.fs = __KERNEL_PERCPU;
324 regs.gs = __KERNEL_STACK_CANARY;
325#else
326 regs.ss = __KERNEL_DS;
327#endif
328
329 regs.orig_ax = -1;
330 regs.ip = (unsigned long) kernel_thread_helper;
331 regs.cs = __KERNEL_CS | get_kernel_rpl();
332 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
333
334 /* Ok, create the new process.. */
335 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
336}
337EXPORT_SYMBOL(kernel_thread);
338
339/*
340 * sys_execve() executes a new program.
341 */
342long sys_execve(const char __user *name,
343 const char __user *const __user *argv,
344 const char __user *const __user *envp, struct pt_regs *regs)
345{
346 long error;
347 char *filename;
348
349 filename = getname(name);
350 error = PTR_ERR(filename);
351 if (IS_ERR(filename))
352 return error;
353 error = do_execve(filename, argv, envp, regs);
354
355#ifdef CONFIG_X86_32
356 if (error == 0) {
357 /* Make sure we don't return using sysenter.. */
358 set_thread_flag(TIF_IRET);
359 }
360#endif
361
362 putname(filename);
363 return error;
364}
365
366/*
367 * Idle related variables and functions 296 * Idle related variables and functions
368 */ 297 */
369unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; 298unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 516fa186121b..44e0bff38e72 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,6 +57,7 @@
57#include <asm/switch_to.h> 57#include <asm/switch_to.h>
58 58
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
60 61
61/* 62/*
62 * Return saved PC of a blocked thread. 63 * Return saved PC of a blocked thread.
@@ -127,23 +128,39 @@ void release_thread(struct task_struct *dead_task)
127} 128}
128 129
129int copy_thread(unsigned long clone_flags, unsigned long sp, 130int copy_thread(unsigned long clone_flags, unsigned long sp,
130 unsigned long unused, 131 unsigned long arg,
131 struct task_struct *p, struct pt_regs *regs) 132 struct task_struct *p, struct pt_regs *regs)
132{ 133{
133 struct pt_regs *childregs; 134 struct pt_regs *childregs = task_pt_regs(p);
134 struct task_struct *tsk; 135 struct task_struct *tsk;
135 int err; 136 int err;
136 137
137 childregs = task_pt_regs(p); 138 p->thread.sp = (unsigned long) childregs;
139 p->thread.sp0 = (unsigned long) (childregs+1);
140
141 if (unlikely(!regs)) {
142 /* kernel thread */
143 memset(childregs, 0, sizeof(struct pt_regs));
144 p->thread.ip = (unsigned long) ret_from_kernel_thread;
145 task_user_gs(p) = __KERNEL_STACK_CANARY;
146 childregs->ds = __USER_DS;
147 childregs->es = __USER_DS;
148 childregs->fs = __KERNEL_PERCPU;
149 childregs->bx = sp; /* function */
150 childregs->bp = arg;
151 childregs->orig_ax = -1;
152 childregs->cs = __KERNEL_CS | get_kernel_rpl();
153 childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
154 p->fpu_counter = 0;
155 p->thread.io_bitmap_ptr = NULL;
156 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
157 return 0;
158 }
138 *childregs = *regs; 159 *childregs = *regs;
139 childregs->ax = 0; 160 childregs->ax = 0;
140 childregs->sp = sp; 161 childregs->sp = sp;
141 162
142 p->thread.sp = (unsigned long) childregs;
143 p->thread.sp0 = (unsigned long) (childregs+1);
144
145 p->thread.ip = (unsigned long) ret_from_fork; 163 p->thread.ip = (unsigned long) ret_from_fork;
146
147 task_user_gs(p) = get_user_gs(regs); 164 task_user_gs(p) = get_user_gs(regs);
148 165
149 p->fpu_counter = 0; 166 p->fpu_counter = 0;
@@ -190,10 +207,12 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
190 regs->cs = __USER_CS; 207 regs->cs = __USER_CS;
191 regs->ip = new_ip; 208 regs->ip = new_ip;
192 regs->sp = new_sp; 209 regs->sp = new_sp;
210 regs->flags = X86_EFLAGS_IF;
193 /* 211 /*
194 * Free the old FP and other extended state 212 * force it to the iret return path by making it look as if there was
213 * some work pending.
195 */ 214 */
196 free_thread_xstate(current); 215 set_thread_flag(TIF_NOTIFY_RESUME);
197} 216}
198EXPORT_SYMBOL_GPL(start_thread); 217EXPORT_SYMBOL_GPL(start_thread);
199 218
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0a980c9d7cb8..16c6365e2b86 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -146,29 +146,18 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
146} 146}
147 147
148int copy_thread(unsigned long clone_flags, unsigned long sp, 148int copy_thread(unsigned long clone_flags, unsigned long sp,
149 unsigned long unused, 149 unsigned long arg,
150 struct task_struct *p, struct pt_regs *regs) 150 struct task_struct *p, struct pt_regs *regs)
151{ 151{
152 int err; 152 int err;
153 struct pt_regs *childregs; 153 struct pt_regs *childregs;
154 struct task_struct *me = current; 154 struct task_struct *me = current;
155 155
156 childregs = ((struct pt_regs *) 156 p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
157 (THREAD_SIZE + task_stack_page(p))) - 1; 157 childregs = task_pt_regs(p);
158 *childregs = *regs;
159
160 childregs->ax = 0;
161 if (user_mode(regs))
162 childregs->sp = sp;
163 else
164 childregs->sp = (unsigned long)childregs;
165
166 p->thread.sp = (unsigned long) childregs; 158 p->thread.sp = (unsigned long) childregs;
167 p->thread.sp0 = (unsigned long) (childregs+1);
168 p->thread.usersp = me->thread.usersp; 159 p->thread.usersp = me->thread.usersp;
169
170 set_tsk_thread_flag(p, TIF_FORK); 160 set_tsk_thread_flag(p, TIF_FORK);
171
172 p->fpu_counter = 0; 161 p->fpu_counter = 0;
173 p->thread.io_bitmap_ptr = NULL; 162 p->thread.io_bitmap_ptr = NULL;
174 163
@@ -178,6 +167,24 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
178 p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; 167 p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
179 savesegment(es, p->thread.es); 168 savesegment(es, p->thread.es);
180 savesegment(ds, p->thread.ds); 169 savesegment(ds, p->thread.ds);
170 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
171
172 if (unlikely(!regs)) {
173 /* kernel thread */
174 memset(childregs, 0, sizeof(struct pt_regs));
175 childregs->sp = (unsigned long)childregs;
176 childregs->ss = __KERNEL_DS;
177 childregs->bx = sp; /* function */
178 childregs->bp = arg;
179 childregs->orig_ax = -1;
180 childregs->cs = __KERNEL_CS | get_kernel_rpl();
181 childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
182 return 0;
183 }
184 *childregs = *regs;
185
186 childregs->ax = 0;
187 childregs->sp = sp;
181 188
182 err = -ENOMEM; 189 err = -ENOMEM;
183 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); 190 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
@@ -232,10 +239,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
232 regs->cs = _cs; 239 regs->cs = _cs;
233 regs->ss = _ss; 240 regs->ss = _ss;
234 regs->flags = X86_EFLAGS_IF; 241 regs->flags = X86_EFLAGS_IF;
235 /*
236 * Free the old FP and other extended state
237 */
238 free_thread_xstate(current);
239} 242}
240 243
241void 244void
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index c4c6a5c2bf0f..b00b33a18390 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
21#include <linux/signal.h> 21#include <linux/signal.h>
22#include <linux/perf_event.h> 22#include <linux/perf_event.h>
23#include <linux/hw_breakpoint.h> 23#include <linux/hw_breakpoint.h>
24#include <linux/rcupdate.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -1332,9 +1333,6 @@ static const struct user_regset_view user_x86_64_view = {
1332#define genregs32_get genregs_get 1333#define genregs32_get genregs_get
1333#define genregs32_set genregs_set 1334#define genregs32_set genregs_set
1334 1335
1335#define user_i387_ia32_struct user_i387_struct
1336#define user32_fxsr_struct user_fxsr_struct
1337
1338#endif /* CONFIG_X86_64 */ 1336#endif /* CONFIG_X86_64 */
1339 1337
1340#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 1338#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1463,6 +1461,8 @@ long syscall_trace_enter(struct pt_regs *regs)
1463{ 1461{
1464 long ret = 0; 1462 long ret = 0;
1465 1463
1464 rcu_user_exit();
1465
1466 /* 1466 /*
1467 * If we stepped into a sysenter/syscall insn, it trapped in 1467 * If we stepped into a sysenter/syscall insn, it trapped in
1468 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. 1468 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
@@ -1526,4 +1526,6 @@ void syscall_trace_leave(struct pt_regs *regs)
1526 !test_thread_flag(TIF_SYSCALL_EMU); 1526 !test_thread_flag(TIF_SYSCALL_EMU);
1527 if (step || test_thread_flag(TIF_SYSCALL_TRACE)) 1527 if (step || test_thread_flag(TIF_SYSCALL_TRACE))
1528 tracehook_report_syscall_exit(regs, step); 1528 tracehook_report_syscall_exit(regs, step);
1529
1530 rcu_user_enter();
1529} 1531}
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index af6db6ec5b2a..4929c1be0ac0 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -225,7 +225,7 @@ static struct platform_device rtc_device = {
225static __init int add_rtc_cmos(void) 225static __init int add_rtc_cmos(void)
226{ 226{
227#ifdef CONFIG_PNP 227#ifdef CONFIG_PNP
228 static const char *ids[] __initconst = 228 static const char * const const ids[] __initconst =
229 { "PNP0b00", "PNP0b01", "PNP0b02", }; 229 { "PNP0b00", "PNP0b01", "PNP0b02", };
230 struct pnp_dev *dev; 230 struct pnp_dev *dev;
231 struct pnp_id *id; 231 struct pnp_id *id;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4f165479c453..a2bb18e02839 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -68,6 +68,7 @@
68#include <linux/percpu.h> 68#include <linux/percpu.h>
69#include <linux/crash_dump.h> 69#include <linux/crash_dump.h>
70#include <linux/tboot.h> 70#include <linux/tboot.h>
71#include <linux/jiffies.h>
71 72
72#include <video/edid.h> 73#include <video/edid.h>
73 74
@@ -957,7 +958,7 @@ void __init setup_arch(char **cmdline_p)
957 initmem_init(); 958 initmem_init();
958 memblock_find_dma_reserve(); 959 memblock_find_dma_reserve();
959 960
960#ifdef CONFIG_KVM_CLOCK 961#ifdef CONFIG_KVM_GUEST
961 kvmclock_init(); 962 kvmclock_init();
962#endif 963#endif
963 964
@@ -1032,6 +1033,8 @@ void __init setup_arch(char **cmdline_p)
1032 mcheck_init(); 1033 mcheck_init();
1033 1034
1034 arch_init_ideal_nops(); 1035 arch_init_ideal_nops();
1036
1037 register_refined_jiffies(CLOCK_TICK_RATE);
1035} 1038}
1036 1039
1037#ifdef CONFIG_X86_32 1040#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b280908a376e..29ad351804e9 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -114,11 +114,12 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
114 regs->orig_ax = -1; /* disable syscall checks */ 114 regs->orig_ax = -1; /* disable syscall checks */
115 115
116 get_user_ex(buf, &sc->fpstate); 116 get_user_ex(buf, &sc->fpstate);
117 err |= restore_i387_xstate(buf);
118 117
119 get_user_ex(*pax, &sc->ax); 118 get_user_ex(*pax, &sc->ax);
120 } get_user_catch(err); 119 } get_user_catch(err);
121 120
121 err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
122
122 return err; 123 return err;
123} 124}
124 125
@@ -206,35 +207,32 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
206 void __user **fpstate) 207 void __user **fpstate)
207{ 208{
208 /* Default to using normal stack */ 209 /* Default to using normal stack */
210 unsigned long math_size = 0;
209 unsigned long sp = regs->sp; 211 unsigned long sp = regs->sp;
212 unsigned long buf_fx = 0;
210 int onsigstack = on_sig_stack(sp); 213 int onsigstack = on_sig_stack(sp);
211 214
212#ifdef CONFIG_X86_64
213 /* redzone */ 215 /* redzone */
214 sp -= 128; 216 if (config_enabled(CONFIG_X86_64))
215#endif /* CONFIG_X86_64 */ 217 sp -= 128;
216 218
217 if (!onsigstack) { 219 if (!onsigstack) {
218 /* This is the X/Open sanctioned signal stack switching. */ 220 /* This is the X/Open sanctioned signal stack switching. */
219 if (ka->sa.sa_flags & SA_ONSTACK) { 221 if (ka->sa.sa_flags & SA_ONSTACK) {
220 if (current->sas_ss_size) 222 if (current->sas_ss_size)
221 sp = current->sas_ss_sp + current->sas_ss_size; 223 sp = current->sas_ss_sp + current->sas_ss_size;
222 } else { 224 } else if (config_enabled(CONFIG_X86_32) &&
223#ifdef CONFIG_X86_32 225 (regs->ss & 0xffff) != __USER_DS &&
224 /* This is the legacy signal stack switching. */ 226 !(ka->sa.sa_flags & SA_RESTORER) &&
225 if ((regs->ss & 0xffff) != __USER_DS && 227 ka->sa.sa_restorer) {
226 !(ka->sa.sa_flags & SA_RESTORER) && 228 /* This is the legacy signal stack switching. */
227 ka->sa.sa_restorer)
228 sp = (unsigned long) ka->sa.sa_restorer; 229 sp = (unsigned long) ka->sa.sa_restorer;
229#endif /* CONFIG_X86_32 */
230 } 230 }
231 } 231 }
232 232
233 if (used_math()) { 233 if (used_math()) {
234 sp -= sig_xstate_size; 234 sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32),
235#ifdef CONFIG_X86_64 235 &buf_fx, &math_size);
236 sp = round_down(sp, 64);
237#endif /* CONFIG_X86_64 */
238 *fpstate = (void __user *)sp; 236 *fpstate = (void __user *)sp;
239 } 237 }
240 238
@@ -247,8 +245,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
247 if (onsigstack && !likely(on_sig_stack(sp))) 245 if (onsigstack && !likely(on_sig_stack(sp)))
248 return (void __user *)-1L; 246 return (void __user *)-1L;
249 247
250 /* save i387 state */ 248 /* save i387 and extended state */
251 if (used_math() && save_i387_xstate(*fpstate) < 0) 249 if (used_math() &&
250 save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0)
252 return (void __user *)-1L; 251 return (void __user *)-1L;
253 252
254 return (void __user *)sp; 253 return (void __user *)sp;
@@ -357,7 +356,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
357 put_user_ex(sig, &frame->sig); 356 put_user_ex(sig, &frame->sig);
358 put_user_ex(&frame->info, &frame->pinfo); 357 put_user_ex(&frame->info, &frame->pinfo);
359 put_user_ex(&frame->uc, &frame->puc); 358 put_user_ex(&frame->uc, &frame->puc);
360 err |= copy_siginfo_to_user(&frame->info, info);
361 359
362 /* Create the ucontext. */ 360 /* Create the ucontext. */
363 if (cpu_has_xsave) 361 if (cpu_has_xsave)
@@ -369,9 +367,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
369 put_user_ex(sas_ss_flags(regs->sp), 367 put_user_ex(sas_ss_flags(regs->sp),
370 &frame->uc.uc_stack.ss_flags); 368 &frame->uc.uc_stack.ss_flags);
371 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 369 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
372 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
373 regs, set->sig[0]);
374 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
375 370
376 /* Set up to return from userspace. */ 371 /* Set up to return from userspace. */
377 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 372 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -388,6 +383,11 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
388 */ 383 */
389 put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); 384 put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
390 } put_user_catch(err); 385 } put_user_catch(err);
386
387 err |= copy_siginfo_to_user(&frame->info, info);
388 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
389 regs, set->sig[0]);
390 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
391 391
392 if (err) 392 if (err)
393 return -EFAULT; 393 return -EFAULT;
@@ -436,8 +436,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
436 put_user_ex(sas_ss_flags(regs->sp), 436 put_user_ex(sas_ss_flags(regs->sp),
437 &frame->uc.uc_stack.ss_flags); 437 &frame->uc.uc_stack.ss_flags);
438 put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 438 put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
439 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
440 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
441 439
442 /* Set up to return from userspace. If provided, use a stub 440 /* Set up to return from userspace. If provided, use a stub
443 already in userspace. */ 441 already in userspace. */
@@ -450,6 +448,9 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
450 } 448 }
451 } put_user_catch(err); 449 } put_user_catch(err);
452 450
451 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
452 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
453
453 if (err) 454 if (err)
454 return -EFAULT; 455 return -EFAULT;
455 456
@@ -474,6 +475,75 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
474} 475}
475#endif /* CONFIG_X86_32 */ 476#endif /* CONFIG_X86_32 */
476 477
478static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
479 siginfo_t *info, compat_sigset_t *set,
480 struct pt_regs *regs)
481{
482#ifdef CONFIG_X86_X32_ABI
483 struct rt_sigframe_x32 __user *frame;
484 void __user *restorer;
485 int err = 0;
486 void __user *fpstate = NULL;
487
488 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
489
490 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
491 return -EFAULT;
492
493 if (ka->sa.sa_flags & SA_SIGINFO) {
494 if (copy_siginfo_to_user32(&frame->info, info))
495 return -EFAULT;
496 }
497
498 put_user_try {
499 /* Create the ucontext. */
500 if (cpu_has_xsave)
501 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
502 else
503 put_user_ex(0, &frame->uc.uc_flags);
504 put_user_ex(0, &frame->uc.uc_link);
505 put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
506 put_user_ex(sas_ss_flags(regs->sp),
507 &frame->uc.uc_stack.ss_flags);
508 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
509 put_user_ex(0, &frame->uc.uc__pad0);
510
511 if (ka->sa.sa_flags & SA_RESTORER) {
512 restorer = ka->sa.sa_restorer;
513 } else {
514 /* could use a vstub here */
515 restorer = NULL;
516 err |= -EFAULT;
517 }
518 put_user_ex(restorer, &frame->pretcode);
519 } put_user_catch(err);
520
521 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
522 regs, set->sig[0]);
523 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
524
525 if (err)
526 return -EFAULT;
527
528 /* Set up registers for signal handler */
529 regs->sp = (unsigned long) frame;
530 regs->ip = (unsigned long) ka->sa.sa_handler;
531
532 /* We use the x32 calling convention here... */
533 regs->di = sig;
534 regs->si = (unsigned long) &frame->info;
535 regs->dx = (unsigned long) &frame->uc;
536
537 loadsegment(ds, __USER_DS);
538 loadsegment(es, __USER_DS);
539
540 regs->cs = __USER_CS;
541 regs->ss = __USER_DS;
542#endif /* CONFIG_X86_X32_ABI */
543
544 return 0;
545}
546
477#ifdef CONFIG_X86_32 547#ifdef CONFIG_X86_32
478/* 548/*
479 * Atomically swap in the new signal mask, and wait for a signal. 549 * Atomically swap in the new signal mask, and wait for a signal.
@@ -612,55 +682,22 @@ static int signr_convert(int sig)
612 return sig; 682 return sig;
613} 683}
614 684
615#ifdef CONFIG_X86_32
616
617#define is_ia32 1
618#define ia32_setup_frame __setup_frame
619#define ia32_setup_rt_frame __setup_rt_frame
620
621#else /* !CONFIG_X86_32 */
622
623#ifdef CONFIG_IA32_EMULATION
624#define is_ia32 test_thread_flag(TIF_IA32)
625#else /* !CONFIG_IA32_EMULATION */
626#define is_ia32 0
627#endif /* CONFIG_IA32_EMULATION */
628
629#ifdef CONFIG_X86_X32_ABI
630#define is_x32 test_thread_flag(TIF_X32)
631
632static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
633 siginfo_t *info, compat_sigset_t *set,
634 struct pt_regs *regs);
635#else /* !CONFIG_X86_X32_ABI */
636#define is_x32 0
637#endif /* CONFIG_X86_X32_ABI */
638
639int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
640 sigset_t *set, struct pt_regs *regs);
641int ia32_setup_frame(int sig, struct k_sigaction *ka,
642 sigset_t *set, struct pt_regs *regs);
643
644#endif /* CONFIG_X86_32 */
645
646static int 685static int
647setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 686setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
648 struct pt_regs *regs) 687 struct pt_regs *regs)
649{ 688{
650 int usig = signr_convert(sig); 689 int usig = signr_convert(sig);
651 sigset_t *set = sigmask_to_save(); 690 sigset_t *set = sigmask_to_save();
691 compat_sigset_t *cset = (compat_sigset_t *) set;
652 692
653 /* Set up the stack frame */ 693 /* Set up the stack frame */
654 if (is_ia32) { 694 if (is_ia32_frame()) {
655 if (ka->sa.sa_flags & SA_SIGINFO) 695 if (ka->sa.sa_flags & SA_SIGINFO)
656 return ia32_setup_rt_frame(usig, ka, info, set, regs); 696 return ia32_setup_rt_frame(usig, ka, info, cset, regs);
657 else 697 else
658 return ia32_setup_frame(usig, ka, set, regs); 698 return ia32_setup_frame(usig, ka, cset, regs);
659#ifdef CONFIG_X86_X32_ABI 699 } else if (is_x32_frame()) {
660 } else if (is_x32) { 700 return x32_setup_rt_frame(usig, ka, info, cset, regs);
661 return x32_setup_rt_frame(usig, ka, info,
662 (compat_sigset_t *)set, regs);
663#endif
664 } else { 701 } else {
665 return __setup_rt_frame(sig, ka, info, set, regs); 702 return __setup_rt_frame(sig, ka, info, set, regs);
666 } 703 }
@@ -779,6 +816,8 @@ static void do_signal(struct pt_regs *regs)
779void 816void
780do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 817do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
781{ 818{
819 rcu_user_exit();
820
782#ifdef CONFIG_X86_MCE 821#ifdef CONFIG_X86_MCE
783 /* notify userspace of pending MCEs */ 822 /* notify userspace of pending MCEs */
784 if (thread_info_flags & _TIF_MCE_NOTIFY) 823 if (thread_info_flags & _TIF_MCE_NOTIFY)
@@ -801,9 +840,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
801 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) 840 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
802 fire_user_return_notifiers(); 841 fire_user_return_notifiers();
803 842
804#ifdef CONFIG_X86_32 843 rcu_user_enter();
805 clear_thread_flag(TIF_IRET);
806#endif /* CONFIG_X86_32 */
807} 844}
808 845
809void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 846void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
@@ -824,72 +861,6 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
824} 861}
825 862
826#ifdef CONFIG_X86_X32_ABI 863#ifdef CONFIG_X86_X32_ABI
827static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
828 siginfo_t *info, compat_sigset_t *set,
829 struct pt_regs *regs)
830{
831 struct rt_sigframe_x32 __user *frame;
832 void __user *restorer;
833 int err = 0;
834 void __user *fpstate = NULL;
835
836 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
837
838 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
839 return -EFAULT;
840
841 if (ka->sa.sa_flags & SA_SIGINFO) {
842 if (copy_siginfo_to_user32(&frame->info, info))
843 return -EFAULT;
844 }
845
846 put_user_try {
847 /* Create the ucontext. */
848 if (cpu_has_xsave)
849 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
850 else
851 put_user_ex(0, &frame->uc.uc_flags);
852 put_user_ex(0, &frame->uc.uc_link);
853 put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
854 put_user_ex(sas_ss_flags(regs->sp),
855 &frame->uc.uc_stack.ss_flags);
856 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
857 put_user_ex(0, &frame->uc.uc__pad0);
858 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
859 regs, set->sig[0]);
860 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
861
862 if (ka->sa.sa_flags & SA_RESTORER) {
863 restorer = ka->sa.sa_restorer;
864 } else {
865 /* could use a vstub here */
866 restorer = NULL;
867 err |= -EFAULT;
868 }
869 put_user_ex(restorer, &frame->pretcode);
870 } put_user_catch(err);
871
872 if (err)
873 return -EFAULT;
874
875 /* Set up registers for signal handler */
876 regs->sp = (unsigned long) frame;
877 regs->ip = (unsigned long) ka->sa.sa_handler;
878
879 /* We use the x32 calling convention here... */
880 regs->di = sig;
881 regs->si = (unsigned long) &frame->info;
882 regs->dx = (unsigned long) &frame->uc;
883
884 loadsegment(ds, __USER_DS);
885 loadsegment(es, __USER_DS);
886
887 regs->cs = __USER_CS;
888 regs->ss = __USER_DS;
889
890 return 0;
891}
892
893asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) 864asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
894{ 865{
895 struct rt_sigframe_x32 __user *frame; 866 struct rt_sigframe_x32 __user *frame;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7c5a8c314c02..c80a33bc528b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -665,7 +665,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
665 unsigned long boot_error = 0; 665 unsigned long boot_error = 0;
666 int timeout; 666 int timeout;
667 667
668 alternatives_smp_switch(1); 668 /* Just in case we booted with a single CPU. */
669 alternatives_enable_smp();
669 670
670 idle->thread.sp = (unsigned long) (((struct pt_regs *) 671 idle->thread.sp = (unsigned long) (((struct pt_regs *)
671 (THREAD_SIZE + task_stack_page(idle))) - 1); 672 (THREAD_SIZE + task_stack_page(idle))) - 1);
@@ -1053,20 +1054,6 @@ out:
1053 preempt_enable(); 1054 preempt_enable();
1054} 1055}
1055 1056
1056void arch_disable_nonboot_cpus_begin(void)
1057{
1058 /*
1059 * Avoid the smp alternatives switch during the disable_nonboot_cpus().
1060 * In the suspend path, we will be back in the SMP mode shortly anyways.
1061 */
1062 skip_smp_alternatives = true;
1063}
1064
1065void arch_disable_nonboot_cpus_end(void)
1066{
1067 skip_smp_alternatives = false;
1068}
1069
1070void arch_enable_nonboot_cpus_begin(void) 1057void arch_enable_nonboot_cpus_begin(void)
1071{ 1058{
1072 set_mtrr_aps_delayed_init(); 1059 set_mtrr_aps_delayed_init();
@@ -1256,9 +1243,6 @@ void native_cpu_die(unsigned int cpu)
1256 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1243 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1257 if (system_state == SYSTEM_RUNNING) 1244 if (system_state == SYSTEM_RUNNING)
1258 pr_info("CPU %u is now offline\n", cpu); 1245 pr_info("CPU %u is now offline\n", cpu);
1259
1260 if (1 == num_online_cpus())
1261 alternatives_smp_switch(0);
1262 return; 1246 return;
1263 } 1247 }
1264 msleep(100); 1248 msleep(100);
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index c346d1161488..cd3b2438a980 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -157,6 +157,33 @@ static int enable_single_step(struct task_struct *child)
157 return 1; 157 return 1;
158} 158}
159 159
160void set_task_blockstep(struct task_struct *task, bool on)
161{
162 unsigned long debugctl;
163
164 /*
165 * Ensure irq/preemption can't change debugctl in between.
166 * Note also that both TIF_BLOCKSTEP and debugctl should
167 * be changed atomically wrt preemption.
168 * FIXME: this means that set/clear TIF_BLOCKSTEP is simply
169 * wrong if task != current, SIGKILL can wakeup the stopped
170 * tracee and set/clear can play with the running task, this
171 * can confuse the next __switch_to_xtra().
172 */
173 local_irq_disable();
174 debugctl = get_debugctlmsr();
175 if (on) {
176 debugctl |= DEBUGCTLMSR_BTF;
177 set_tsk_thread_flag(task, TIF_BLOCKSTEP);
178 } else {
179 debugctl &= ~DEBUGCTLMSR_BTF;
180 clear_tsk_thread_flag(task, TIF_BLOCKSTEP);
181 }
182 if (task == current)
183 update_debugctlmsr(debugctl);
184 local_irq_enable();
185}
186
160/* 187/*
161 * Enable single or block step. 188 * Enable single or block step.
162 */ 189 */
@@ -169,19 +196,10 @@ static void enable_step(struct task_struct *child, bool block)
169 * So no one should try to use debugger block stepping in a program 196 * So no one should try to use debugger block stepping in a program
170 * that uses user-mode single stepping itself. 197 * that uses user-mode single stepping itself.
171 */ 198 */
172 if (enable_single_step(child) && block) { 199 if (enable_single_step(child) && block)
173 unsigned long debugctl = get_debugctlmsr(); 200 set_task_blockstep(child, true);
174 201 else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
175 debugctl |= DEBUGCTLMSR_BTF; 202 set_task_blockstep(child, false);
176 update_debugctlmsr(debugctl);
177 set_tsk_thread_flag(child, TIF_BLOCKSTEP);
178 } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
179 unsigned long debugctl = get_debugctlmsr();
180
181 debugctl &= ~DEBUGCTLMSR_BTF;
182 update_debugctlmsr(debugctl);
183 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
184 }
185} 203}
186 204
187void user_enable_single_step(struct task_struct *child) 205void user_enable_single_step(struct task_struct *child)
@@ -199,13 +217,8 @@ void user_disable_single_step(struct task_struct *child)
199 /* 217 /*
200 * Make sure block stepping (BTF) is disabled. 218 * Make sure block stepping (BTF) is disabled.
201 */ 219 */
202 if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { 220 if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
203 unsigned long debugctl = get_debugctlmsr(); 221 set_task_blockstep(child, false);
204
205 debugctl &= ~DEBUGCTLMSR_BTF;
206 update_debugctlmsr(debugctl);
207 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
208 }
209 222
210 /* Always clear TIF_SINGLESTEP... */ 223 /* Always clear TIF_SINGLESTEP... */
211 clear_tsk_thread_flag(child, TIF_SINGLESTEP); 224 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
deleted file mode 100644
index 0b0cb5fede19..000000000000
--- a/arch/x86/kernel/sys_i386_32.c
+++ /dev/null
@@ -1,40 +0,0 @@
1/*
2 * This file contains various random system calls that
3 * have a non-standard calling sequence on the Linux/i386
4 * platform.
5 */
6
7#include <linux/errno.h>
8#include <linux/sched.h>
9#include <linux/mm.h>
10#include <linux/fs.h>
11#include <linux/smp.h>
12#include <linux/sem.h>
13#include <linux/msg.h>
14#include <linux/shm.h>
15#include <linux/stat.h>
16#include <linux/syscalls.h>
17#include <linux/mman.h>
18#include <linux/file.h>
19#include <linux/utsname.h>
20#include <linux/ipc.h>
21
22#include <linux/uaccess.h>
23#include <linux/unistd.h>
24
25#include <asm/syscalls.h>
26
27/*
28 * Do a system call from kernel instead of calling sys_execve so we
29 * end up with proper pt_regs.
30 */
31int kernel_execve(const char *filename,
32 const char *const argv[],
33 const char *const envp[])
34{
35 long __res;
36 asm volatile ("int $0x80"
37 : "=a" (__res)
38 : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
39 return __res;
40}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b481341c9369..8276dc6794cc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -55,6 +55,7 @@
55#include <asm/i387.h> 55#include <asm/i387.h>
56#include <asm/fpu-internal.h> 56#include <asm/fpu-internal.h>
57#include <asm/mce.h> 57#include <asm/mce.h>
58#include <asm/rcu.h>
58 59
59#include <asm/mach_traps.h> 60#include <asm/mach_traps.h>
60 61
@@ -107,30 +108,45 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
107 dec_preempt_count(); 108 dec_preempt_count();
108} 109}
109 110
110static void __kprobes 111static int __kprobes
111do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, 112do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
112 long error_code, siginfo_t *info) 113 struct pt_regs *regs, long error_code)
113{ 114{
114 struct task_struct *tsk = current;
115
116#ifdef CONFIG_X86_32 115#ifdef CONFIG_X86_32
117 if (regs->flags & X86_VM_MASK) { 116 if (regs->flags & X86_VM_MASK) {
118 /* 117 /*
119 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. 118 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
120 * On nmi (interrupt 2), do_trap should not be called. 119 * On nmi (interrupt 2), do_trap should not be called.
121 */ 120 */
122 if (trapnr < X86_TRAP_UD) 121 if (trapnr < X86_TRAP_UD) {
123 goto vm86_trap; 122 if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
124 goto trap_signal; 123 error_code, trapnr))
124 return 0;
125 }
126 return -1;
125 } 127 }
126#endif 128#endif
129 if (!user_mode(regs)) {
130 if (!fixup_exception(regs)) {
131 tsk->thread.error_code = error_code;
132 tsk->thread.trap_nr = trapnr;
133 die(str, regs, error_code);
134 }
135 return 0;
136 }
127 137
128 if (!user_mode(regs)) 138 return -1;
129 goto kernel_trap; 139}
130 140
131#ifdef CONFIG_X86_32 141static void __kprobes
132trap_signal: 142do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
133#endif 143 long error_code, siginfo_t *info)
144{
145 struct task_struct *tsk = current;
146
147
148 if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
149 return;
134 /* 150 /*
135 * We want error_code and trap_nr set for userspace faults and 151 * We want error_code and trap_nr set for userspace faults and
136 * kernelspace faults which result in die(), but not 152 * kernelspace faults which result in die(), but not
@@ -158,33 +174,20 @@ trap_signal:
158 force_sig_info(signr, info, tsk); 174 force_sig_info(signr, info, tsk);
159 else 175 else
160 force_sig(signr, tsk); 176 force_sig(signr, tsk);
161 return;
162
163kernel_trap:
164 if (!fixup_exception(regs)) {
165 tsk->thread.error_code = error_code;
166 tsk->thread.trap_nr = trapnr;
167 die(str, regs, error_code);
168 }
169 return;
170
171#ifdef CONFIG_X86_32
172vm86_trap:
173 if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
174 error_code, trapnr))
175 goto trap_signal;
176 return;
177#endif
178} 177}
179 178
180#define DO_ERROR(trapnr, signr, str, name) \ 179#define DO_ERROR(trapnr, signr, str, name) \
181dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ 180dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
182{ \ 181{ \
183 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 182 exception_enter(regs); \
184 == NOTIFY_STOP) \ 183 if (notify_die(DIE_TRAP, str, regs, error_code, \
184 trapnr, signr) == NOTIFY_STOP) { \
185 exception_exit(regs); \
185 return; \ 186 return; \
187 } \
186 conditional_sti(regs); \ 188 conditional_sti(regs); \
187 do_trap(trapnr, signr, str, regs, error_code, NULL); \ 189 do_trap(trapnr, signr, str, regs, error_code, NULL); \
190 exception_exit(regs); \
188} 191}
189 192
190#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 193#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
@@ -195,11 +198,15 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
195 info.si_errno = 0; \ 198 info.si_errno = 0; \
196 info.si_code = sicode; \ 199 info.si_code = sicode; \
197 info.si_addr = (void __user *)siaddr; \ 200 info.si_addr = (void __user *)siaddr; \
198 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 201 exception_enter(regs); \
199 == NOTIFY_STOP) \ 202 if (notify_die(DIE_TRAP, str, regs, error_code, \
203 trapnr, signr) == NOTIFY_STOP) { \
204 exception_exit(regs); \
200 return; \ 205 return; \
206 } \
201 conditional_sti(regs); \ 207 conditional_sti(regs); \
202 do_trap(trapnr, signr, str, regs, error_code, &info); \ 208 do_trap(trapnr, signr, str, regs, error_code, &info); \
209 exception_exit(regs); \
203} 210}
204 211
205DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, 212DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
@@ -222,12 +229,14 @@ DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
222/* Runs on IST stack */ 229/* Runs on IST stack */
223dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) 230dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
224{ 231{
232 exception_enter(regs);
225 if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 233 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
226 X86_TRAP_SS, SIGBUS) == NOTIFY_STOP) 234 X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) {
227 return; 235 preempt_conditional_sti(regs);
228 preempt_conditional_sti(regs); 236 do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
229 do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); 237 preempt_conditional_cli(regs);
230 preempt_conditional_cli(regs); 238 }
239 exception_exit(regs);
231} 240}
232 241
233dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) 242dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
@@ -235,6 +244,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
235 static const char str[] = "double fault"; 244 static const char str[] = "double fault";
236 struct task_struct *tsk = current; 245 struct task_struct *tsk = current;
237 246
247 exception_enter(regs);
238 /* Return not checked because double check cannot be ignored */ 248 /* Return not checked because double check cannot be ignored */
239 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); 249 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
240 250
@@ -255,16 +265,29 @@ do_general_protection(struct pt_regs *regs, long error_code)
255{ 265{
256 struct task_struct *tsk; 266 struct task_struct *tsk;
257 267
268 exception_enter(regs);
258 conditional_sti(regs); 269 conditional_sti(regs);
259 270
260#ifdef CONFIG_X86_32 271#ifdef CONFIG_X86_32
261 if (regs->flags & X86_VM_MASK) 272 if (regs->flags & X86_VM_MASK) {
262 goto gp_in_vm86; 273 local_irq_enable();
274 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
275 goto exit;
276 }
263#endif 277#endif
264 278
265 tsk = current; 279 tsk = current;
266 if (!user_mode(regs)) 280 if (!user_mode(regs)) {
267 goto gp_in_kernel; 281 if (fixup_exception(regs))
282 goto exit;
283
284 tsk->thread.error_code = error_code;
285 tsk->thread.trap_nr = X86_TRAP_GP;
286 if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
287 X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
288 die("general protection fault", regs, error_code);
289 goto exit;
290 }
268 291
269 tsk->thread.error_code = error_code; 292 tsk->thread.error_code = error_code;
270 tsk->thread.trap_nr = X86_TRAP_GP; 293 tsk->thread.trap_nr = X86_TRAP_GP;
@@ -279,25 +302,8 @@ do_general_protection(struct pt_regs *regs, long error_code)
279 } 302 }
280 303
281 force_sig(SIGSEGV, tsk); 304 force_sig(SIGSEGV, tsk);
282 return; 305exit:
283 306 exception_exit(regs);
284#ifdef CONFIG_X86_32
285gp_in_vm86:
286 local_irq_enable();
287 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
288 return;
289#endif
290
291gp_in_kernel:
292 if (fixup_exception(regs))
293 return;
294
295 tsk->thread.error_code = error_code;
296 tsk->thread.trap_nr = X86_TRAP_GP;
297 if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
298 X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
299 return;
300 die("general protection fault", regs, error_code);
301} 307}
302 308
303/* May run on IST stack. */ 309/* May run on IST stack. */
@@ -312,15 +318,16 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
312 ftrace_int3_handler(regs)) 318 ftrace_int3_handler(regs))
313 return; 319 return;
314#endif 320#endif
321 exception_enter(regs);
315#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 322#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
316 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 323 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
317 SIGTRAP) == NOTIFY_STOP) 324 SIGTRAP) == NOTIFY_STOP)
318 return; 325 goto exit;
319#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ 326#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
320 327
321 if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 328 if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
322 SIGTRAP) == NOTIFY_STOP) 329 SIGTRAP) == NOTIFY_STOP)
323 return; 330 goto exit;
324 331
325 /* 332 /*
326 * Let others (NMI) know that the debug stack is in use 333 * Let others (NMI) know that the debug stack is in use
@@ -331,6 +338,8 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
331 do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); 338 do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
332 preempt_conditional_cli(regs); 339 preempt_conditional_cli(regs);
333 debug_stack_usage_dec(); 340 debug_stack_usage_dec();
341exit:
342 exception_exit(regs);
334} 343}
335 344
336#ifdef CONFIG_X86_64 345#ifdef CONFIG_X86_64
@@ -391,6 +400,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
391 unsigned long dr6; 400 unsigned long dr6;
392 int si_code; 401 int si_code;
393 402
403 exception_enter(regs);
404
394 get_debugreg(dr6, 6); 405 get_debugreg(dr6, 6);
395 406
396 /* Filter out all the reserved bits which are preset to 1 */ 407 /* Filter out all the reserved bits which are preset to 1 */
@@ -406,7 +417,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
406 417
407 /* Catch kmemcheck conditions first of all! */ 418 /* Catch kmemcheck conditions first of all! */
408 if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) 419 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
409 return; 420 goto exit;
410 421
411 /* DR6 may or may not be cleared by the CPU */ 422 /* DR6 may or may not be cleared by the CPU */
412 set_debugreg(0, 6); 423 set_debugreg(0, 6);
@@ -421,7 +432,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
421 432
422 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, 433 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
423 SIGTRAP) == NOTIFY_STOP) 434 SIGTRAP) == NOTIFY_STOP)
424 return; 435 goto exit;
425 436
426 /* 437 /*
427 * Let others (NMI) know that the debug stack is in use 438 * Let others (NMI) know that the debug stack is in use
@@ -437,7 +448,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
437 X86_TRAP_DB); 448 X86_TRAP_DB);
438 preempt_conditional_cli(regs); 449 preempt_conditional_cli(regs);
439 debug_stack_usage_dec(); 450 debug_stack_usage_dec();
440 return; 451 goto exit;
441 } 452 }
442 453
443 /* 454 /*
@@ -458,7 +469,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
458 preempt_conditional_cli(regs); 469 preempt_conditional_cli(regs);
459 debug_stack_usage_dec(); 470 debug_stack_usage_dec();
460 471
461 return; 472exit:
473 exception_exit(regs);
462} 474}
463 475
464/* 476/*
@@ -555,14 +567,17 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
555#ifdef CONFIG_X86_32 567#ifdef CONFIG_X86_32
556 ignore_fpu_irq = 1; 568 ignore_fpu_irq = 1;
557#endif 569#endif
558 570 exception_enter(regs);
559 math_error(regs, error_code, X86_TRAP_MF); 571 math_error(regs, error_code, X86_TRAP_MF);
572 exception_exit(regs);
560} 573}
561 574
562dotraplinkage void 575dotraplinkage void
563do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 576do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
564{ 577{
578 exception_enter(regs);
565 math_error(regs, error_code, X86_TRAP_XF); 579 math_error(regs, error_code, X86_TRAP_XF);
580 exception_exit(regs);
566} 581}
567 582
568dotraplinkage void 583dotraplinkage void
@@ -613,11 +628,12 @@ void math_state_restore(void)
613 } 628 }
614 629
615 __thread_fpu_begin(tsk); 630 __thread_fpu_begin(tsk);
631
616 /* 632 /*
617 * Paranoid restore. send a SIGSEGV if we fail to restore the state. 633 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
618 */ 634 */
619 if (unlikely(restore_fpu_checking(tsk))) { 635 if (unlikely(restore_fpu_checking(tsk))) {
620 __thread_fpu_end(tsk); 636 drop_init_fpu(tsk);
621 force_sig(SIGSEGV, tsk); 637 force_sig(SIGSEGV, tsk);
622 return; 638 return;
623 } 639 }
@@ -629,6 +645,9 @@ EXPORT_SYMBOL_GPL(math_state_restore);
629dotraplinkage void __kprobes 645dotraplinkage void __kprobes
630do_device_not_available(struct pt_regs *regs, long error_code) 646do_device_not_available(struct pt_regs *regs, long error_code)
631{ 647{
648 exception_enter(regs);
649 BUG_ON(use_eager_fpu());
650
632#ifdef CONFIG_MATH_EMULATION 651#ifdef CONFIG_MATH_EMULATION
633 if (read_cr0() & X86_CR0_EM) { 652 if (read_cr0() & X86_CR0_EM) {
634 struct math_emu_info info = { }; 653 struct math_emu_info info = { };
@@ -637,6 +656,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
637 656
638 info.regs = regs; 657 info.regs = regs;
639 math_emulate(&info); 658 math_emulate(&info);
659 exception_exit(regs);
640 return; 660 return;
641 } 661 }
642#endif 662#endif
@@ -644,12 +664,15 @@ do_device_not_available(struct pt_regs *regs, long error_code)
644#ifdef CONFIG_X86_32 664#ifdef CONFIG_X86_32
645 conditional_sti(regs); 665 conditional_sti(regs);
646#endif 666#endif
667 exception_exit(regs);
647} 668}
648 669
649#ifdef CONFIG_X86_32 670#ifdef CONFIG_X86_32
650dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) 671dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
651{ 672{
652 siginfo_t info; 673 siginfo_t info;
674
675 exception_enter(regs);
653 local_irq_enable(); 676 local_irq_enable();
654 677
655 info.si_signo = SIGILL; 678 info.si_signo = SIGILL;
@@ -657,10 +680,11 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
657 info.si_code = ILL_BADSTK; 680 info.si_code = ILL_BADSTK;
658 info.si_addr = NULL; 681 info.si_addr = NULL;
659 if (notify_die(DIE_TRAP, "iret exception", regs, error_code, 682 if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
660 X86_TRAP_IRET, SIGILL) == NOTIFY_STOP) 683 X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
661 return; 684 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
662 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, 685 &info);
663 &info); 686 }
687 exception_exit(regs);
664} 688}
665#endif 689#endif
666 690
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 36fd42091fa7..9538f00827a9 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -41,6 +41,9 @@
41/* Adjust the return address of a call insn */ 41/* Adjust the return address of a call insn */
42#define UPROBE_FIX_CALL 0x2 42#define UPROBE_FIX_CALL 0x2
43 43
44/* Instruction will modify TF, don't change it */
45#define UPROBE_FIX_SETF 0x4
46
44#define UPROBE_FIX_RIP_AX 0x8000 47#define UPROBE_FIX_RIP_AX 0x8000
45#define UPROBE_FIX_RIP_CX 0x4000 48#define UPROBE_FIX_RIP_CX 0x4000
46 49
@@ -239,6 +242,10 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
239 insn_get_opcode(insn); /* should be a nop */ 242 insn_get_opcode(insn); /* should be a nop */
240 243
241 switch (OPCODE1(insn)) { 244 switch (OPCODE1(insn)) {
245 case 0x9d:
246 /* popf */
247 auprobe->fixups |= UPROBE_FIX_SETF;
248 break;
242 case 0xc3: /* ret/lret */ 249 case 0xc3: /* ret/lret */
243 case 0xcb: 250 case 0xcb:
244 case 0xc2: 251 case 0xc2:
@@ -646,7 +653,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
646 * Skip these instructions as per the currently known x86 ISA. 653 * Skip these instructions as per the currently known x86 ISA.
647 * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 } 654 * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 }
648 */ 655 */
649bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) 656static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
650{ 657{
651 int i; 658 int i;
652 659
@@ -673,3 +680,46 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
673 } 680 }
674 return false; 681 return false;
675} 682}
683
684bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
685{
686 bool ret = __skip_sstep(auprobe, regs);
687 if (ret && (regs->flags & X86_EFLAGS_TF))
688 send_sig(SIGTRAP, current, 0);
689 return ret;
690}
691
692void arch_uprobe_enable_step(struct arch_uprobe *auprobe)
693{
694 struct task_struct *task = current;
695 struct arch_uprobe_task *autask = &task->utask->autask;
696 struct pt_regs *regs = task_pt_regs(task);
697
698 autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF);
699
700 regs->flags |= X86_EFLAGS_TF;
701 if (test_tsk_thread_flag(task, TIF_BLOCKSTEP))
702 set_task_blockstep(task, false);
703}
704
705void arch_uprobe_disable_step(struct arch_uprobe *auprobe)
706{
707 struct task_struct *task = current;
708 struct arch_uprobe_task *autask = &task->utask->autask;
709 bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED);
710 struct pt_regs *regs = task_pt_regs(task);
711 /*
712 * The state of TIF_BLOCKSTEP was not saved so we can get an extra
713 * SIGTRAP if we do not clear TF. We need to examine the opcode to
714 * make it right.
715 */
716 if (unlikely(trapped)) {
717 if (!autask->saved_tf)
718 regs->flags &= ~X86_EFLAGS_TF;
719 } else {
720 if (autask->saved_tf)
721 send_sig(SIGTRAP, task, 0);
722 else if (!(auprobe->fixups & UPROBE_FIX_SETF))
723 regs->flags &= ~X86_EFLAGS_TF;
724 }
725}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 54abcc0baf23..5c9687b1bde6 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -561,9 +561,9 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
561 if ((trapno == 3) || (trapno == 1)) { 561 if ((trapno == 3) || (trapno == 1)) {
562 KVM86->regs32->ax = VM86_TRAP + (trapno << 8); 562 KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
563 /* setting this flag forces the code in entry_32.S to 563 /* setting this flag forces the code in entry_32.S to
564 call save_v86_state() and change the stack pointer 564 the path where we call save_v86_state() and change
565 to KVM86->regs32 */ 565 the stack pointer to KVM86->regs32 */
566 set_thread_flag(TIF_IRET); 566 set_thread_flag(TIF_NOTIFY_RESUME);
567 return 0; 567 return 0;
568 } 568 }
569 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); 569 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8d141b309046..3a3e8c9e280d 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -28,7 +28,7 @@
28#include <linux/jiffies.h> 28#include <linux/jiffies.h>
29#include <linux/sysctl.h> 29#include <linux/sysctl.h>
30#include <linux/topology.h> 30#include <linux/topology.h>
31#include <linux/clocksource.h> 31#include <linux/timekeeper_internal.h>
32#include <linux/getcpu.h> 32#include <linux/getcpu.h>
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/smp.h> 34#include <linux/smp.h>
@@ -82,32 +82,41 @@ void update_vsyscall_tz(void)
82 vsyscall_gtod_data.sys_tz = sys_tz; 82 vsyscall_gtod_data.sys_tz = sys_tz;
83} 83}
84 84
85void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, 85void update_vsyscall(struct timekeeper *tk)
86 struct clocksource *clock, u32 mult)
87{ 86{
88 struct timespec monotonic; 87 struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
89 88
90 write_seqcount_begin(&vsyscall_gtod_data.seq); 89 write_seqcount_begin(&vdata->seq);
91 90
92 /* copy vsyscall data */ 91 /* copy vsyscall data */
93 vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; 92 vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
94 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 93 vdata->clock.cycle_last = tk->clock->cycle_last;
95 vsyscall_gtod_data.clock.mask = clock->mask; 94 vdata->clock.mask = tk->clock->mask;
96 vsyscall_gtod_data.clock.mult = mult; 95 vdata->clock.mult = tk->mult;
97 vsyscall_gtod_data.clock.shift = clock->shift; 96 vdata->clock.shift = tk->shift;
98 97
99 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 98 vdata->wall_time_sec = tk->xtime_sec;
100 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 99 vdata->wall_time_snsec = tk->xtime_nsec;
100
101 vdata->monotonic_time_sec = tk->xtime_sec
102 + tk->wall_to_monotonic.tv_sec;
103 vdata->monotonic_time_snsec = tk->xtime_nsec
104 + (tk->wall_to_monotonic.tv_nsec
105 << tk->shift);
106 while (vdata->monotonic_time_snsec >=
107 (((u64)NSEC_PER_SEC) << tk->shift)) {
108 vdata->monotonic_time_snsec -=
109 ((u64)NSEC_PER_SEC) << tk->shift;
110 vdata->monotonic_time_sec++;
111 }
101 112
102 monotonic = timespec_add(*wall_time, *wtm); 113 vdata->wall_time_coarse.tv_sec = tk->xtime_sec;
103 vsyscall_gtod_data.monotonic_time_sec = monotonic.tv_sec; 114 vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
104 vsyscall_gtod_data.monotonic_time_nsec = monotonic.tv_nsec;
105 115
106 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 116 vdata->monotonic_time_coarse = timespec_add(vdata->wall_time_coarse,
107 vsyscall_gtod_data.monotonic_time_coarse = 117 tk->wall_to_monotonic);
108 timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm);
109 118
110 write_seqcount_end(&vsyscall_gtod_data.seq); 119 write_seqcount_end(&vdata->seq);
111} 120}
112 121
113static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, 122static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 6020f6f5927c..1330dd102950 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -13,9 +13,13 @@
13#include <asm/ftrace.h> 13#include <asm/ftrace.h>
14 14
15#ifdef CONFIG_FUNCTION_TRACER 15#ifdef CONFIG_FUNCTION_TRACER
16/* mcount is defined in assembly */ 16/* mcount and __fentry__ are defined in assembly */
17#ifdef CC_USING_FENTRY
18EXPORT_SYMBOL(__fentry__);
19#else
17EXPORT_SYMBOL(mcount); 20EXPORT_SYMBOL(mcount);
18#endif 21#endif
22#endif
19 23
20EXPORT_SYMBOL(__get_user_1); 24EXPORT_SYMBOL(__get_user_1);
21EXPORT_SYMBOL(__get_user_2); 25EXPORT_SYMBOL(__get_user_2);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 3d3e20709119..ada87a329edc 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -10,9 +10,7 @@
10#include <linux/compat.h> 10#include <linux/compat.h>
11#include <asm/i387.h> 11#include <asm/i387.h>
12#include <asm/fpu-internal.h> 12#include <asm/fpu-internal.h>
13#ifdef CONFIG_IA32_EMULATION 13#include <asm/sigframe.h>
14#include <asm/sigcontext32.h>
15#endif
16#include <asm/xcr.h> 14#include <asm/xcr.h>
17 15
18/* 16/*
@@ -23,13 +21,9 @@ u64 pcntxt_mask;
23/* 21/*
24 * Represents init state for the supported extended state. 22 * Represents init state for the supported extended state.
25 */ 23 */
26static struct xsave_struct *init_xstate_buf; 24struct xsave_struct *init_xstate_buf;
27
28struct _fpx_sw_bytes fx_sw_reserved;
29#ifdef CONFIG_IA32_EMULATION
30struct _fpx_sw_bytes fx_sw_reserved_ia32;
31#endif
32 25
26static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
33static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; 27static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
34 28
35/* 29/*
@@ -44,9 +38,9 @@ static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
44 */ 38 */
45void __sanitize_i387_state(struct task_struct *tsk) 39void __sanitize_i387_state(struct task_struct *tsk)
46{ 40{
47 u64 xstate_bv;
48 int feature_bit = 0x2;
49 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; 41 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
42 int feature_bit = 0x2;
43 u64 xstate_bv;
50 44
51 if (!fx) 45 if (!fx)
52 return; 46 return;
@@ -104,213 +98,326 @@ void __sanitize_i387_state(struct task_struct *tsk)
104 * Check for the presence of extended state information in the 98 * Check for the presence of extended state information in the
105 * user fpstate pointer in the sigcontext. 99 * user fpstate pointer in the sigcontext.
106 */ 100 */
107int check_for_xstate(struct i387_fxsave_struct __user *buf, 101static inline int check_for_xstate(struct i387_fxsave_struct __user *buf,
108 void __user *fpstate, 102 void __user *fpstate,
109 struct _fpx_sw_bytes *fx_sw_user) 103 struct _fpx_sw_bytes *fx_sw)
110{ 104{
111 int min_xstate_size = sizeof(struct i387_fxsave_struct) + 105 int min_xstate_size = sizeof(struct i387_fxsave_struct) +
112 sizeof(struct xsave_hdr_struct); 106 sizeof(struct xsave_hdr_struct);
113 unsigned int magic2; 107 unsigned int magic2;
114 int err;
115 108
116 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], 109 if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
117 sizeof(struct _fpx_sw_bytes)); 110 return -1;
118 if (err)
119 return -EFAULT;
120 111
121 /* 112 /* Check for the first magic field and other error scenarios. */
122 * First Magic check failed. 113 if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
123 */ 114 fx_sw->xstate_size < min_xstate_size ||
124 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) 115 fx_sw->xstate_size > xstate_size ||
125 return -EINVAL; 116 fx_sw->xstate_size > fx_sw->extended_size)
117 return -1;
126 118
127 /* 119 /*
128 * Check for error scenarios.
129 */
130 if (fx_sw_user->xstate_size < min_xstate_size ||
131 fx_sw_user->xstate_size > xstate_size ||
132 fx_sw_user->xstate_size > fx_sw_user->extended_size)
133 return -EINVAL;
134
135 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
136 fx_sw_user->extended_size -
137 FP_XSTATE_MAGIC2_SIZE));
138 if (err)
139 return err;
140 /*
141 * Check for the presence of second magic word at the end of memory 120 * Check for the presence of second magic word at the end of memory
142 * layout. This detects the case where the user just copied the legacy 121 * layout. This detects the case where the user just copied the legacy
143 * fpstate layout with out copying the extended state information 122 * fpstate layout with out copying the extended state information
144 * in the memory layout. 123 * in the memory layout.
145 */ 124 */
146 if (magic2 != FP_XSTATE_MAGIC2) 125 if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
147 return -EFAULT; 126 || magic2 != FP_XSTATE_MAGIC2)
127 return -1;
148 128
149 return 0; 129 return 0;
150} 130}
151 131
152#ifdef CONFIG_X86_64
153/* 132/*
154 * Signal frame handlers. 133 * Signal frame handlers.
155 */ 134 */
156 135static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
157int save_i387_xstate(void __user *buf)
158{ 136{
159 struct task_struct *tsk = current; 137 if (use_fxsr()) {
160 int err = 0; 138 struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
161 139 struct user_i387_ia32_struct env;
162 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size)) 140 struct _fpstate_ia32 __user *fp = buf;
163 return -EACCES;
164 141
165 BUG_ON(sig_xstate_size < xstate_size); 142 convert_from_fxsr(&env, tsk);
166 143
167 if ((unsigned long)buf % 64) 144 if (__copy_to_user(buf, &env, sizeof(env)) ||
168 pr_err("%s: bad fpstate %p\n", __func__, buf); 145 __put_user(xsave->i387.swd, &fp->status) ||
169 146 __put_user(X86_FXSR_MAGIC, &fp->magic))
170 if (!used_math()) 147 return -1;
171 return 0;
172
173 if (user_has_fpu()) {
174 if (use_xsave())
175 err = xsave_user(buf);
176 else
177 err = fxsave_user(buf);
178
179 if (err)
180 return err;
181 user_fpu_end();
182 } else { 148 } else {
183 sanitize_i387_state(tsk); 149 struct i387_fsave_struct __user *fp = buf;
184 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, 150 u32 swd;
185 xstate_size)) 151 if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
186 return -1; 152 return -1;
187 } 153 }
188 154
189 clear_used_math(); /* trigger finit */ 155 return 0;
156}
190 157
191 if (use_xsave()) { 158static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
192 struct _fpstate __user *fx = buf; 159{
193 struct _xstate __user *x = buf; 160 struct xsave_struct __user *x = buf;
194 u64 xstate_bv; 161 struct _fpx_sw_bytes *sw_bytes;
162 u32 xstate_bv;
163 int err;
195 164
196 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved, 165 /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
197 sizeof(struct _fpx_sw_bytes)); 166 sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
167 err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
198 168
199 err |= __put_user(FP_XSTATE_MAGIC2, 169 if (!use_xsave())
200 (__u32 __user *) (buf + sig_xstate_size 170 return err;
201 - FP_XSTATE_MAGIC2_SIZE));
202 171
203 /* 172 err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size));
204 * Read the xstate_bv which we copied (directly from the cpu or
205 * from the state in task struct) to the user buffers and
206 * set the FP/SSE bits.
207 */
208 err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
209 173
210 /* 174 /*
211 * For legacy compatible, we always set FP/SSE bits in the bit 175 * Read the xstate_bv which we copied (directly from the cpu or
212 * vector while saving the state to the user context. This will 176 * from the state in task struct) to the user buffers.
213 * enable us capturing any changes(during sigreturn) to 177 */
214 * the FP/SSE bits by the legacy applications which don't touch 178 err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
215 * xstate_bv in the xsave header.
216 *
217 * xsave aware apps can change the xstate_bv in the xsave
218 * header as well as change any contents in the memory layout.
219 * xrestore as part of sigreturn will capture all the changes.
220 */
221 xstate_bv |= XSTATE_FPSSE;
222 179
223 err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv); 180 /*
181 * For legacy compatible, we always set FP/SSE bits in the bit
182 * vector while saving the state to the user context. This will
183 * enable us capturing any changes(during sigreturn) to
184 * the FP/SSE bits by the legacy applications which don't touch
185 * xstate_bv in the xsave header.
186 *
187 * xsave aware apps can change the xstate_bv in the xsave
188 * header as well as change any contents in the memory layout.
189 * xrestore as part of sigreturn will capture all the changes.
190 */
191 xstate_bv |= XSTATE_FPSSE;
224 192
225 if (err) 193 err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
226 return err;
227 }
228 194
229 return 1; 195 return err;
196}
197
198static inline int save_user_xstate(struct xsave_struct __user *buf)
199{
200 int err;
201
202 if (use_xsave())
203 err = xsave_user(buf);
204 else if (use_fxsr())
205 err = fxsave_user((struct i387_fxsave_struct __user *) buf);
206 else
207 err = fsave_user((struct i387_fsave_struct __user *) buf);
208
209 if (unlikely(err) && __clear_user(buf, xstate_size))
210 err = -EFAULT;
211 return err;
230} 212}
231 213
232/* 214/*
233 * Restore the extended state if present. Otherwise, restore the FP/SSE 215 * Save the fpu, extended register state to the user signal frame.
234 * state. 216 *
217 * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
218 * state is copied.
219 * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
220 *
221 * buf == buf_fx for 64-bit frames and 32-bit fsave frame.
222 * buf != buf_fx for 32-bit frames with fxstate.
223 *
224 * If the fpu, extended register state is live, save the state directly
225 * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
226 * copy the thread's fpu state to the user frame starting at 'buf_fx'.
227 *
228 * If this is a 32-bit frame with fxstate, put a fsave header before
229 * the aligned state at 'buf_fx'.
230 *
231 * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
232 * indicating the absence/presence of the extended state to the user.
235 */ 233 */
236static int restore_user_xstate(void __user *buf) 234int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)
237{ 235{
238 struct _fpx_sw_bytes fx_sw_user; 236 struct xsave_struct *xsave = &current->thread.fpu.state->xsave;
239 u64 mask; 237 struct task_struct *tsk = current;
240 int err; 238 int ia32_fxstate = (buf != buf_fx);
241 239
242 if (((unsigned long)buf % 64) || 240 ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
243 check_for_xstate(buf, buf, &fx_sw_user)) 241 config_enabled(CONFIG_IA32_EMULATION));
244 goto fx_only;
245 242
246 mask = fx_sw_user.xstate_bv; 243 if (!access_ok(VERIFY_WRITE, buf, size))
244 return -EACCES;
247 245
248 /* 246 if (!HAVE_HWFP)
249 * restore the state passed by the user. 247 return fpregs_soft_get(current, NULL, 0,
250 */ 248 sizeof(struct user_i387_ia32_struct), NULL,
251 err = xrestore_user(buf, mask); 249 (struct _fpstate_ia32 __user *) buf) ? -1 : 1;
252 if (err)
253 return err;
254 250
255 /* 251 if (user_has_fpu()) {
256 * init the state skipped by the user. 252 /* Save the live register state to the user directly. */
257 */ 253 if (save_user_xstate(buf_fx))
258 mask = pcntxt_mask & ~mask; 254 return -1;
259 if (unlikely(mask)) 255 /* Update the thread's fxstate to save the fsave header. */
260 xrstor_state(init_xstate_buf, mask); 256 if (ia32_fxstate)
257 fpu_fxsave(&tsk->thread.fpu);
258 } else {
259 sanitize_i387_state(tsk);
260 if (__copy_to_user(buf_fx, xsave, xstate_size))
261 return -1;
262 }
263
264 /* Save the fsave header for the 32-bit frames. */
265 if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
266 return -1;
267
268 if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
269 return -1;
270
271 drop_init_fpu(tsk); /* trigger finit */
261 272
262 return 0; 273 return 0;
274}
263 275
264fx_only: 276static inline void
265 /* 277sanitize_restored_xstate(struct task_struct *tsk,
266 * couldn't find the extended state information in the 278 struct user_i387_ia32_struct *ia32_env,
267 * memory layout. Restore just the FP/SSE and init all 279 u64 xstate_bv, int fx_only)
268 * the other extended state. 280{
269 */ 281 struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
270 xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE); 282 struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr;
271 return fxrstor_checking((__force struct i387_fxsave_struct *)buf); 283
284 if (use_xsave()) {
285 /* These bits must be zero. */
286 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
287
288 /*
289 * Init the state that is not present in the memory
290 * layout and not enabled by the OS.
291 */
292 if (fx_only)
293 xsave_hdr->xstate_bv = XSTATE_FPSSE;
294 else
295 xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv);
296 }
297
298 if (use_fxsr()) {
299 /*
300 * mscsr reserved bits must be masked to zero for security
301 * reasons.
302 */
303 xsave->i387.mxcsr &= mxcsr_feature_mask;
304
305 convert_to_fxsr(tsk, ia32_env);
306 }
272} 307}
273 308
274/* 309/*
275 * This restores directly out of user space. Exceptions are handled. 310 * Restore the extended state if present. Otherwise, restore the FP/SSE state.
276 */ 311 */
277int restore_i387_xstate(void __user *buf) 312static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only)
278{ 313{
314 if (use_xsave()) {
315 if ((unsigned long)buf % 64 || fx_only) {
316 u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE;
317 xrstor_state(init_xstate_buf, init_bv);
318 return fxrstor_user(buf);
319 } else {
320 u64 init_bv = pcntxt_mask & ~xbv;
321 if (unlikely(init_bv))
322 xrstor_state(init_xstate_buf, init_bv);
323 return xrestore_user(buf, xbv);
324 }
325 } else if (use_fxsr()) {
326 return fxrstor_user(buf);
327 } else
328 return frstor_user(buf);
329}
330
331int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
332{
333 int ia32_fxstate = (buf != buf_fx);
279 struct task_struct *tsk = current; 334 struct task_struct *tsk = current;
280 int err = 0; 335 int state_size = xstate_size;
336 u64 xstate_bv = 0;
337 int fx_only = 0;
338
339 ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
340 config_enabled(CONFIG_IA32_EMULATION));
281 341
282 if (!buf) { 342 if (!buf) {
283 if (used_math()) 343 drop_init_fpu(tsk);
284 goto clear;
285 return 0; 344 return 0;
286 } else 345 }
287 if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
288 return -EACCES;
289 346
290 if (!used_math()) { 347 if (!access_ok(VERIFY_READ, buf, size))
291 err = init_fpu(tsk); 348 return -EACCES;
292 if (err) 349
293 return err; 350 if (!used_math() && init_fpu(tsk))
351 return -1;
352
353 if (!HAVE_HWFP) {
354 return fpregs_soft_set(current, NULL,
355 0, sizeof(struct user_i387_ia32_struct),
356 NULL, buf) != 0;
294 } 357 }
295 358
296 user_fpu_begin(); 359 if (use_xsave()) {
297 if (use_xsave()) 360 struct _fpx_sw_bytes fx_sw_user;
298 err = restore_user_xstate(buf); 361 if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
299 else 362 /*
300 err = fxrstor_checking((__force struct i387_fxsave_struct *) 363 * Couldn't find the extended state information in the
301 buf); 364 * memory layout. Restore just the FP/SSE and init all
302 if (unlikely(err)) { 365 * the other extended state.
366 */
367 state_size = sizeof(struct i387_fxsave_struct);
368 fx_only = 1;
369 } else {
370 state_size = fx_sw_user.xstate_size;
371 xstate_bv = fx_sw_user.xstate_bv;
372 }
373 }
374
375 if (ia32_fxstate) {
376 /*
377 * For 32-bit frames with fxstate, copy the user state to the
378 * thread's fpu state, reconstruct fxstate from the fsave
379 * header. Sanitize the copied state etc.
380 */
381 struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
382 struct user_i387_ia32_struct env;
383 int err = 0;
384
385 /*
386 * Drop the current fpu which clears used_math(). This ensures
387 * that any context-switch during the copy of the new state,
388 * avoids the intermediate state from getting restored/saved.
389 * Thus avoiding the new restored state from getting corrupted.
390 * We will be ready to restore/save the state only after
391 * set_used_math() is again set.
392 */
393 drop_fpu(tsk);
394
395 if (__copy_from_user(xsave, buf_fx, state_size) ||
396 __copy_from_user(&env, buf, sizeof(env))) {
397 err = -1;
398 } else {
399 sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only);
400 set_used_math();
401 }
402
403 if (use_eager_fpu())
404 math_state_restore();
405
406 return err;
407 } else {
303 /* 408 /*
304 * Encountered an error while doing the restore from the 409 * For 64-bit frames and 32-bit fsave frames, restore the user
305 * user buffer, clear the fpu state. 410 * state to the registers directly (with exceptions handled).
306 */ 411 */
307clear: 412 user_fpu_begin();
308 clear_fpu(tsk); 413 if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
309 clear_used_math(); 414 drop_init_fpu(tsk);
415 return -1;
416 }
310 } 417 }
311 return err; 418
419 return 0;
312} 420}
313#endif
314 421
315/* 422/*
316 * Prepare the SW reserved portion of the fxsave memory layout, indicating 423 * Prepare the SW reserved portion of the fxsave memory layout, indicating
@@ -321,31 +428,22 @@ clear:
321 */ 428 */
322static void prepare_fx_sw_frame(void) 429static void prepare_fx_sw_frame(void)
323{ 430{
324 int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) + 431 int fsave_header_size = sizeof(struct i387_fsave_struct);
325 FP_XSTATE_MAGIC2_SIZE; 432 int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
326 433
327 sig_xstate_size = sizeof(struct _fpstate) + size_extended; 434 if (config_enabled(CONFIG_X86_32))
328 435 size += fsave_header_size;
329#ifdef CONFIG_IA32_EMULATION
330 sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
331#endif
332
333 memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
334 436
335 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; 437 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
336 fx_sw_reserved.extended_size = sig_xstate_size; 438 fx_sw_reserved.extended_size = size;
337 fx_sw_reserved.xstate_bv = pcntxt_mask; 439 fx_sw_reserved.xstate_bv = pcntxt_mask;
338 fx_sw_reserved.xstate_size = xstate_size; 440 fx_sw_reserved.xstate_size = xstate_size;
339#ifdef CONFIG_IA32_EMULATION
340 memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
341 sizeof(struct _fpx_sw_bytes));
342 fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
343#endif
344}
345 441
346#ifdef CONFIG_X86_64 442 if (config_enabled(CONFIG_IA32_EMULATION)) {
347unsigned int sig_xstate_size = sizeof(struct _fpstate); 443 fx_sw_reserved_ia32 = fx_sw_reserved;
348#endif 444 fx_sw_reserved_ia32.extended_size += fsave_header_size;
445 }
446}
349 447
350/* 448/*
351 * Enable the extended processor state save/restore feature 449 * Enable the extended processor state save/restore feature
@@ -384,19 +482,21 @@ static void __init setup_xstate_features(void)
384/* 482/*
385 * setup the xstate image representing the init state 483 * setup the xstate image representing the init state
386 */ 484 */
387static void __init setup_xstate_init(void) 485static void __init setup_init_fpu_buf(void)
388{ 486{
389 setup_xstate_features();
390
391 /* 487 /*
392 * Setup init_xstate_buf to represent the init state of 488 * Setup init_xstate_buf to represent the init state of
393 * all the features managed by the xsave 489 * all the features managed by the xsave
394 */ 490 */
395 init_xstate_buf = alloc_bootmem_align(xstate_size, 491 init_xstate_buf = alloc_bootmem_align(xstate_size,
396 __alignof__(struct xsave_struct)); 492 __alignof__(struct xsave_struct));
397 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; 493 fx_finit(&init_xstate_buf->i387);
494
495 if (!cpu_has_xsave)
496 return;
497
498 setup_xstate_features();
398 499
399 clts();
400 /* 500 /*
401 * Init all the features state with header_bv being 0x0 501 * Init all the features state with header_bv being 0x0
402 */ 502 */
@@ -406,9 +506,21 @@ static void __init setup_xstate_init(void)
406 * of any feature which is not represented by all zero's. 506 * of any feature which is not represented by all zero's.
407 */ 507 */
408 xsave_state(init_xstate_buf, -1); 508 xsave_state(init_xstate_buf, -1);
409 stts();
410} 509}
411 510
511static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
512static int __init eager_fpu_setup(char *s)
513{
514 if (!strcmp(s, "on"))
515 eagerfpu = ENABLE;
516 else if (!strcmp(s, "off"))
517 eagerfpu = DISABLE;
518 else if (!strcmp(s, "auto"))
519 eagerfpu = AUTO;
520 return 1;
521}
522__setup("eagerfpu=", eager_fpu_setup);
523
412/* 524/*
413 * Enable and initialize the xsave feature. 525 * Enable and initialize the xsave feature.
414 */ 526 */
@@ -445,8 +557,11 @@ static void __init xstate_enable_boot_cpu(void)
445 557
446 update_regset_xstate_info(xstate_size, pcntxt_mask); 558 update_regset_xstate_info(xstate_size, pcntxt_mask);
447 prepare_fx_sw_frame(); 559 prepare_fx_sw_frame();
560 setup_init_fpu_buf();
448 561
449 setup_xstate_init(); 562 /* Auto enable eagerfpu for xsaveopt */
563 if (cpu_has_xsaveopt && eagerfpu != DISABLE)
564 eagerfpu = ENABLE;
450 565
451 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", 566 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
452 pcntxt_mask, xstate_size); 567 pcntxt_mask, xstate_size);
@@ -471,3 +586,43 @@ void __cpuinit xsave_init(void)
471 next_func = xstate_enable; 586 next_func = xstate_enable;
472 this_func(); 587 this_func();
473} 588}
589
590static inline void __init eager_fpu_init_bp(void)
591{
592 current->thread.fpu.state =
593 alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
594 if (!init_xstate_buf)
595 setup_init_fpu_buf();
596}
597
598void __cpuinit eager_fpu_init(void)
599{
600 static __refdata void (*boot_func)(void) = eager_fpu_init_bp;
601
602 clear_used_math();
603 current_thread_info()->status = 0;
604
605 if (eagerfpu == ENABLE)
606 setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
607
608 if (!cpu_has_eager_fpu) {
609 stts();
610 return;
611 }
612
613 if (boot_func) {
614 boot_func();
615 boot_func = NULL;
616 }
617
618 /*
619 * This is same as math_state_restore(). But use_xsave() is
620 * not yet patched to use math_state_restore().
621 */
622 init_fpu(current);
623 __thread_fpu_begin(current);
624 if (cpu_has_xsave)
625 xrstor_state(init_xstate_buf, -1);
626 else
627 fxrstor_checking(&init_xstate_buf->i387);
628}