Diffstat (limited to 'arch/x86/kernel')
54 files changed, 1292 insertions, 786 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 91ce48f05f9f..34e923a53762 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
| @@ -9,7 +9,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) | |||
| 9 | ifdef CONFIG_FUNCTION_TRACER | 9 | ifdef CONFIG_FUNCTION_TRACER |
| 10 | # Do not profile debug and lowlevel utilities | 10 | # Do not profile debug and lowlevel utilities |
| 11 | CFLAGS_REMOVE_tsc.o = -pg | 11 | CFLAGS_REMOVE_tsc.o = -pg |
| 12 | CFLAGS_REMOVE_rtc.o = -pg | ||
| 13 | CFLAGS_REMOVE_paravirt-spinlocks.o = -pg | 12 | CFLAGS_REMOVE_paravirt-spinlocks.o = -pg |
| 14 | CFLAGS_REMOVE_pvclock.o = -pg | 13 | CFLAGS_REMOVE_pvclock.o = -pg |
| 15 | CFLAGS_REMOVE_kvmclock.o = -pg | 14 | CFLAGS_REMOVE_kvmclock.o = -pg |
| @@ -62,6 +61,7 @@ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o | |||
| 62 | obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o | 61 | obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o |
| 63 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o | 62 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o |
| 64 | obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o | 63 | obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o |
| 64 | obj-$(CONFIG_X86_TSC) += trace_clock.o | ||
| 65 | obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o | 65 | obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o |
| 66 | obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o | 66 | obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o |
| 67 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o | 67 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o |
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e651f7a589ac..bacf4b0d91f4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
| @@ -574,6 +574,12 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) | |||
| 574 | 574 | ||
| 575 | return irq; | 575 | return irq; |
| 576 | } | 576 | } |
| 577 | EXPORT_SYMBOL_GPL(acpi_register_gsi); | ||
| 578 | |||
| 579 | void acpi_unregister_gsi(u32 gsi) | ||
| 580 | { | ||
| 581 | } | ||
| 582 | EXPORT_SYMBOL_GPL(acpi_unregister_gsi); | ||
| 577 | 583 | ||
| 578 | void __init acpi_set_irq_model_pic(void) | 584 | void __init acpi_set_irq_model_pic(void) |
| 579 | { | 585 | { |
| @@ -1700,3 +1706,9 @@ int __acpi_release_global_lock(unsigned int *lock) | |||
| 1700 | } while (unlikely (val != old)); | 1706 | } while (unlikely (val != old)); |
| 1701 | return old & 0x1; | 1707 | return old & 0x1; |
| 1702 | } | 1708 | } |
| 1709 | |||
| 1710 | void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) | ||
| 1711 | { | ||
| 1712 | e820_add_region(addr, size, E820_ACPI); | ||
| 1713 | update_e820(); | ||
| 1714 | } | ||
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 11676cf65aee..d5e0d717005a 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
| @@ -101,6 +101,8 @@ static int __init acpi_sleep_setup(char *str) | |||
| 101 | #endif | 101 | #endif |
| 102 | if (strncmp(str, "nonvs", 5) == 0) | 102 | if (strncmp(str, "nonvs", 5) == 0) |
| 103 | acpi_nvs_nosave(); | 103 | acpi_nvs_nosave(); |
| 104 | if (strncmp(str, "nonvs_s3", 8) == 0) | ||
| 105 | acpi_nvs_nosave_s3(); | ||
| 104 | if (strncmp(str, "old_ordering", 12) == 0) | 106 | if (strncmp(str, "old_ordering", 12) == 0) |
| 105 | acpi_old_suspend_ordering(); | 107 | acpi_old_suspend_ordering(); |
| 106 | str = strchr(str, ','); | 108 | str = strchr(str, ','); |
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b17416e72fbd..b994cc84aa7e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
| @@ -90,21 +90,6 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); | |||
| 90 | */ | 90 | */ |
| 91 | DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); | 91 | DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); |
| 92 | 92 | ||
| 93 | /* | ||
| 94 | * Knob to control our willingness to enable the local APIC. | ||
| 95 | * | ||
| 96 | * +1=force-enable | ||
| 97 | */ | ||
| 98 | static int force_enable_local_apic __initdata; | ||
| 99 | /* | ||
| 100 | * APIC command line parameters | ||
| 101 | */ | ||
| 102 | static int __init parse_lapic(char *arg) | ||
| 103 | { | ||
| 104 | force_enable_local_apic = 1; | ||
| 105 | return 0; | ||
| 106 | } | ||
| 107 | early_param("lapic", parse_lapic); | ||
| 108 | /* Local APIC was disabled by the BIOS and enabled by the kernel */ | 93 | /* Local APIC was disabled by the BIOS and enabled by the kernel */ |
| 109 | static int enabled_via_apicbase; | 94 | static int enabled_via_apicbase; |
| 110 | 95 | ||
| @@ -133,6 +118,25 @@ static inline void imcr_apic_to_pic(void) | |||
| 133 | } | 118 | } |
| 134 | #endif | 119 | #endif |
| 135 | 120 | ||
| 121 | /* | ||
| 122 | * Knob to control our willingness to enable the local APIC. | ||
| 123 | * | ||
| 124 | * +1=force-enable | ||
| 125 | */ | ||
| 126 | static int force_enable_local_apic __initdata; | ||
| 127 | /* | ||
| 128 | * APIC command line parameters | ||
| 129 | */ | ||
| 130 | static int __init parse_lapic(char *arg) | ||
| 131 | { | ||
| 132 | if (config_enabled(CONFIG_X86_32) && !arg) | ||
| 133 | force_enable_local_apic = 1; | ||
| 134 | else if (!strncmp(arg, "notscdeadline", 13)) | ||
| 135 | setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); | ||
| 136 | return 0; | ||
| 137 | } | ||
| 138 | early_param("lapic", parse_lapic); | ||
| 139 | |||
| 136 | #ifdef CONFIG_X86_64 | 140 | #ifdef CONFIG_X86_64 |
| 137 | static int apic_calibrate_pmtmr __initdata; | 141 | static int apic_calibrate_pmtmr __initdata; |
| 138 | static __init int setup_apicpmtimer(char *s) | 142 | static __init int setup_apicpmtimer(char *s) |
| @@ -315,6 +319,7 @@ int lapic_get_maxlvt(void) | |||
| 315 | 319 | ||
| 316 | /* Clock divisor */ | 320 | /* Clock divisor */ |
| 317 | #define APIC_DIVISOR 16 | 321 | #define APIC_DIVISOR 16 |
| 322 | #define TSC_DIVISOR 32 | ||
| 318 | 323 | ||
| 319 | /* | 324 | /* |
| 320 | * This function sets up the local APIC timer, with a timeout of | 325 | * This function sets up the local APIC timer, with a timeout of |
| @@ -333,6 +338,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | |||
| 333 | lvtt_value = LOCAL_TIMER_VECTOR; | 338 | lvtt_value = LOCAL_TIMER_VECTOR; |
| 334 | if (!oneshot) | 339 | if (!oneshot) |
| 335 | lvtt_value |= APIC_LVT_TIMER_PERIODIC; | 340 | lvtt_value |= APIC_LVT_TIMER_PERIODIC; |
| 341 | else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) | ||
| 342 | lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; | ||
| 343 | |||
| 336 | if (!lapic_is_integrated()) | 344 | if (!lapic_is_integrated()) |
| 337 | lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); | 345 | lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); |
| 338 | 346 | ||
| @@ -341,6 +349,11 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | |||
| 341 | 349 | ||
| 342 | apic_write(APIC_LVTT, lvtt_value); | 350 | apic_write(APIC_LVTT, lvtt_value); |
| 343 | 351 | ||
| 352 | if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) { | ||
| 353 | printk_once(KERN_DEBUG "TSC deadline timer enabled\n"); | ||
| 354 | return; | ||
| 355 | } | ||
| 356 | |||
| 344 | /* | 357 | /* |
| 345 | * Divide PICLK by 16 | 358 | * Divide PICLK by 16 |
| 346 | */ | 359 | */ |
| @@ -453,6 +466,16 @@ static int lapic_next_event(unsigned long delta, | |||
| 453 | return 0; | 466 | return 0; |
| 454 | } | 467 | } |
| 455 | 468 | ||
| 469 | static int lapic_next_deadline(unsigned long delta, | ||
| 470 | struct clock_event_device *evt) | ||
| 471 | { | ||
| 472 | u64 tsc; | ||
| 473 | |||
| 474 | rdtscll(tsc); | ||
| 475 | wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); | ||
| 476 | return 0; | ||
| 477 | } | ||
| 478 | |||
| 456 | /* | 479 | /* |
| 457 | * Setup the lapic timer in periodic or oneshot mode | 480 | * Setup the lapic timer in periodic or oneshot mode |
| 458 | */ | 481 | */ |
| @@ -533,7 +556,15 @@ static void __cpuinit setup_APIC_timer(void) | |||
| 533 | memcpy(levt, &lapic_clockevent, sizeof(*levt)); | 556 | memcpy(levt, &lapic_clockevent, sizeof(*levt)); |
| 534 | levt->cpumask = cpumask_of(smp_processor_id()); | 557 | levt->cpumask = cpumask_of(smp_processor_id()); |
| 535 | 558 | ||
| 536 | clockevents_register_device(levt); | 559 | if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { |
| 560 | levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | | ||
| 561 | CLOCK_EVT_FEAT_DUMMY); | ||
| 562 | levt->set_next_event = lapic_next_deadline; | ||
| 563 | clockevents_config_and_register(levt, | ||
| 564 | (tsc_khz / TSC_DIVISOR) * 1000, | ||
| 565 | 0xF, ~0UL); | ||
| 566 | } else | ||
| 567 | clockevents_register_device(levt); | ||
| 537 | } | 568 | } |
| 538 | 569 | ||
| 539 | /* | 570 | /* |
| @@ -661,7 +692,9 @@ static int __init calibrate_APIC_clock(void) | |||
| 661 | * in the clockevent structure and return. | 692 | * in the clockevent structure and return. |
| 662 | */ | 693 | */ |
| 663 | 694 | ||
| 664 | if (lapic_timer_frequency) { | 695 | if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { |
| 696 | return 0; | ||
| 697 | } else if (lapic_timer_frequency) { | ||
| 665 | apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", | 698 | apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", |
| 666 | lapic_timer_frequency); | 699 | lapic_timer_frequency); |
| 667 | lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, | 700 | lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, |
| @@ -674,6 +707,9 @@ static int __init calibrate_APIC_clock(void) | |||
| 674 | return 0; | 707 | return 0; |
| 675 | } | 708 | } |
| 676 | 709 | ||
| 710 | apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" | ||
| 711 | "calibrating APIC timer ...\n"); | ||
| 712 | |||
| 677 | local_irq_disable(); | 713 | local_irq_disable(); |
| 678 | 714 | ||
| 679 | /* Replace the global interrupt handler */ | 715 | /* Replace the global interrupt handler */ |
| @@ -811,9 +847,6 @@ void __init setup_boot_APIC_clock(void) | |||
| 811 | return; | 847 | return; |
| 812 | } | 848 | } |
| 813 | 849 | ||
| 814 | apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" | ||
| 815 | "calibrating APIC timer ...\n"); | ||
| 816 | |||
| 817 | if (calibrate_APIC_clock()) { | 850 | if (calibrate_APIC_clock()) { |
| 818 | /* No broadcast on UP ! */ | 851 | /* No broadcast on UP ! */ |
| 819 | if (num_possible_cpus() > 1) | 852 | if (num_possible_cpus() > 1) |
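The apic.c hunks above add TSC-deadline support to the local APIC clockevent: parse_lapic() now also accepts lapic=notscdeadline to clear the feature bit, __setup_APIC_LVTT() selects APIC_LVT_TIMER_TSCDEADLINE for one-shot mode, and setup_APIC_timer() registers the device at (tsc_khz / TSC_DIVISOR) * 1000 Hz with lapic_next_deadline() as the programming hook. Below is a minimal sketch of the resulting tick-to-cycle scaling; the numbers are illustrative and deadline_from_delta() is a made-up helper, not part of the commit.

/*
 * Sketch: how a clockevent delta maps onto MSR_IA32_TSC_DEADLINE.
 * The clockevent runs at (tsc_khz / TSC_DIVISOR) * 1000 Hz, so one tick
 * equals TSC_DIVISOR TSC cycles.  Example: a 2 GHz TSC gives tsc_khz =
 * 2000000, the device runs at 62.5 MHz, and an event 10 us out uses
 * delta = 625 ticks, i.e. a deadline of rdtsc() + 625 * 32 cycles.
 */
static u64 deadline_from_delta(u64 now_tsc, unsigned long delta)
{
	return now_tsc + (u64)delta * TSC_DIVISOR;	/* ticks -> TSC cycles */
}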
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index a65829ac2b9a..9c2aa89a11cb 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
| 23 | #include <linux/delay.h> | 23 | #include <linux/delay.h> |
| 24 | 24 | ||
| 25 | #include <asm/numachip/numachip.h> | ||
| 25 | #include <asm/numachip/numachip_csr.h> | 26 | #include <asm/numachip/numachip_csr.h> |
| 26 | #include <asm/smp.h> | 27 | #include <asm/smp.h> |
| 27 | #include <asm/apic.h> | 28 | #include <asm/apic.h> |
| @@ -179,6 +180,7 @@ static int __init numachip_system_init(void) | |||
| 179 | return 0; | 180 | return 0; |
| 180 | 181 | ||
| 181 | x86_cpuinit.fixup_cpu_id = fixup_cpu_id; | 182 | x86_cpuinit.fixup_cpu_id = fixup_cpu_id; |
| 183 | x86_init.pci.arch_init = pci_numachip_init; | ||
| 182 | 184 | ||
| 183 | map_csrs(); | 185 | map_csrs(); |
| 184 | 186 | ||
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1817fa911024..b739d398bb29 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
| @@ -234,11 +234,11 @@ int __init arch_early_irq_init(void) | |||
| 234 | zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); | 234 | zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); |
| 235 | /* | 235 | /* |
| 236 | * For legacy IRQ's, start with assigning irq0 to irq15 to | 236 | * For legacy IRQ's, start with assigning irq0 to irq15 to |
| 237 | * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. | 237 | * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. |
| 238 | */ | 238 | */ |
| 239 | if (i < legacy_pic->nr_legacy_irqs) { | 239 | if (i < legacy_pic->nr_legacy_irqs) { |
| 240 | cfg[i].vector = IRQ0_VECTOR + i; | 240 | cfg[i].vector = IRQ0_VECTOR + i; |
| 241 | cpumask_set_cpu(0, cfg[i].domain); | 241 | cpumask_setall(cfg[i].domain); |
| 242 | } | 242 | } |
| 243 | } | 243 | } |
| 244 | 244 | ||
| @@ -1141,7 +1141,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) | |||
| 1141 | * allocation for the members that are not used anymore. | 1141 | * allocation for the members that are not used anymore. |
| 1142 | */ | 1142 | */ |
| 1143 | cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); | 1143 | cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); |
| 1144 | cfg->move_in_progress = 1; | 1144 | cfg->move_in_progress = |
| 1145 | cpumask_intersects(cfg->old_domain, cpu_online_mask); | ||
| 1145 | cpumask_and(cfg->domain, cfg->domain, tmp_mask); | 1146 | cpumask_and(cfg->domain, cfg->domain, tmp_mask); |
| 1146 | break; | 1147 | break; |
| 1147 | } | 1148 | } |
| @@ -1172,8 +1173,9 @@ next: | |||
| 1172 | current_vector = vector; | 1173 | current_vector = vector; |
| 1173 | current_offset = offset; | 1174 | current_offset = offset; |
| 1174 | if (cfg->vector) { | 1175 | if (cfg->vector) { |
| 1175 | cfg->move_in_progress = 1; | ||
| 1176 | cpumask_copy(cfg->old_domain, cfg->domain); | 1176 | cpumask_copy(cfg->old_domain, cfg->domain); |
| 1177 | cfg->move_in_progress = | ||
| 1178 | cpumask_intersects(cfg->old_domain, cpu_online_mask); | ||
| 1177 | } | 1179 | } |
| 1178 | for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) | 1180 | for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) |
| 1179 | per_cpu(vector_irq, new_cpu)[vector] = irq; | 1181 | per_cpu(vector_irq, new_cpu)[vector] = irq; |
| @@ -1241,12 +1243,6 @@ void __setup_vector_irq(int cpu) | |||
| 1241 | cfg = irq_get_chip_data(irq); | 1243 | cfg = irq_get_chip_data(irq); |
| 1242 | if (!cfg) | 1244 | if (!cfg) |
| 1243 | continue; | 1245 | continue; |
| 1244 | /* | ||
| 1245 | * If it is a legacy IRQ handled by the legacy PIC, this cpu | ||
| 1246 | * will be part of the irq_cfg's domain. | ||
| 1247 | */ | ||
| 1248 | if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) | ||
| 1249 | cpumask_set_cpu(cpu, cfg->domain); | ||
| 1250 | 1246 | ||
| 1251 | if (!cpumask_test_cpu(cpu, cfg->domain)) | 1247 | if (!cpumask_test_cpu(cpu, cfg->domain)) |
| 1252 | continue; | 1248 | continue; |
| @@ -1356,16 +1352,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, | |||
| 1356 | if (!IO_APIC_IRQ(irq)) | 1352 | if (!IO_APIC_IRQ(irq)) |
| 1357 | return; | 1353 | return; |
| 1358 | 1354 | ||
| 1359 | /* | ||
| 1360 | * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC | ||
| 1361 | * can handle this irq and the apic driver is finialized at this point, | ||
| 1362 | * update the cfg->domain. | ||
| 1363 | */ | ||
| 1364 | if (irq < legacy_pic->nr_legacy_irqs && | ||
| 1365 | cpumask_equal(cfg->domain, cpumask_of(0))) | ||
| 1366 | apic->vector_allocation_domain(0, cfg->domain, | ||
| 1367 | apic->target_cpus()); | ||
| 1368 | |||
| 1369 | if (assign_irq_vector(irq, cfg, apic->target_cpus())) | 1355 | if (assign_irq_vector(irq, cfg, apic->target_cpus())) |
| 1370 | return; | 1356 | return; |
| 1371 | 1357 | ||
| @@ -2199,9 +2185,11 @@ static int ioapic_retrigger_irq(struct irq_data *data) | |||
| 2199 | { | 2185 | { |
| 2200 | struct irq_cfg *cfg = data->chip_data; | 2186 | struct irq_cfg *cfg = data->chip_data; |
| 2201 | unsigned long flags; | 2187 | unsigned long flags; |
| 2188 | int cpu; | ||
| 2202 | 2189 | ||
| 2203 | raw_spin_lock_irqsave(&vector_lock, flags); | 2190 | raw_spin_lock_irqsave(&vector_lock, flags); |
| 2204 | apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); | 2191 | cpu = cpumask_first_and(cfg->domain, cpu_online_mask); |
| 2192 | apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); | ||
| 2205 | raw_spin_unlock_irqrestore(&vector_lock, flags); | 2193 | raw_spin_unlock_irqrestore(&vector_lock, flags); |
| 2206 | 2194 | ||
| 2207 | return 1; | 2195 | return 1; |
| @@ -3317,8 +3305,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) | |||
| 3317 | int ret; | 3305 | int ret; |
| 3318 | 3306 | ||
| 3319 | if (irq_remapping_enabled) { | 3307 | if (irq_remapping_enabled) { |
| 3320 | if (!setup_hpet_msi_remapped(irq, id)) | 3308 | ret = setup_hpet_msi_remapped(irq, id); |
| 3321 | return -1; | 3309 | if (ret) |
| 3310 | return ret; | ||
| 3322 | } | 3311 | } |
| 3323 | 3312 | ||
| 3324 | ret = msi_compose_msg(NULL, irq, &msg, id); | 3313 | ret = msi_compose_msg(NULL, irq, &msg, id); |
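A common thread in the io_apic.c hunks above: cfg->domain may now contain offline CPUs (legacy IRQs start out with cpumask_setall() instead of CPU 0 only), so anything that actually targets a CPU or decides whether vector cleanup is needed has to intersect the domain with cpu_online_mask first. A one-line sketch of that filtering follows; target_cpu_for() is an illustrative name, not a kernel function.

/* Pick a CPU that can really receive the IRQ: a member of the vector
 * domain that is currently online -- the same pattern used by
 * ioapic_retrigger_irq() in the hunk above. */
static int target_cpu_for(struct irq_cfg *cfg)
{
	return cpumask_first_and(cfg->domain, cpu_online_mask);
}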
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f7e98a2c0d12..15239fffd6fe 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
| @@ -304,7 +304,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) | |||
| 304 | int cpu = smp_processor_id(); | 304 | int cpu = smp_processor_id(); |
| 305 | 305 | ||
| 306 | /* get information required for multi-node processors */ | 306 | /* get information required for multi-node processors */ |
| 307 | if (cpu_has(c, X86_FEATURE_TOPOEXT)) { | 307 | if (cpu_has_topoext) { |
| 308 | u32 eax, ebx, ecx, edx; | 308 | u32 eax, ebx, ecx, edx; |
| 309 | 309 | ||
| 310 | cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); | 310 | cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); |
| @@ -631,6 +631,20 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
| 631 | } | 631 | } |
| 632 | } | 632 | } |
| 633 | 633 | ||
| 634 | /* | ||
| 635 | * The way access filter has a performance penalty on some workloads. | ||
| 636 | * Disable it on the affected CPUs. | ||
| 637 | */ | ||
| 638 | if ((c->x86 == 0x15) && | ||
| 639 | (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { | ||
| 640 | u64 val; | ||
| 641 | |||
| 642 | if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) { | ||
| 643 | val |= 0x1E; | ||
| 644 | wrmsrl_safe(0xc0011021, val); | ||
| 645 | } | ||
| 646 | } | ||
| 647 | |||
| 634 | cpu_detect_cache_sizes(c); | 648 | cpu_detect_cache_sizes(c); |
| 635 | 649 | ||
| 636 | /* Multi core CPU? */ | 650 | /* Multi core CPU? */ |
| @@ -643,12 +657,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
| 643 | detect_ht(c); | 657 | detect_ht(c); |
| 644 | #endif | 658 | #endif |
| 645 | 659 | ||
| 646 | if (c->extended_cpuid_level >= 0x80000006) { | 660 | init_amd_cacheinfo(c); |
| 647 | if (cpuid_edx(0x80000006) & 0xf000) | ||
| 648 | num_cache_leaves = 4; | ||
| 649 | else | ||
| 650 | num_cache_leaves = 3; | ||
| 651 | } | ||
| 652 | 661 | ||
| 653 | if (c->x86 >= 0xf) | 662 | if (c->x86 >= 0xf) |
| 654 | set_cpu_cap(c, X86_FEATURE_K8); | 663 | set_cpu_cap(c, X86_FEATURE_K8); |
| @@ -739,9 +748,6 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, | |||
| 739 | 748 | ||
| 740 | static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) | 749 | static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) |
| 741 | { | 750 | { |
| 742 | if (!cpu_has_invlpg) | ||
| 743 | return; | ||
| 744 | |||
| 745 | tlb_flushall_shift = 5; | 751 | tlb_flushall_shift = 5; |
| 746 | 752 | ||
| 747 | if (c->x86 <= 0x11) | 753 | if (c->x86 <= 0x11) |
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d0e910da16c5..92dfec986a48 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
| @@ -107,53 +107,17 @@ static void __init check_hlt(void) | |||
| 107 | } | 107 | } |
| 108 | 108 | ||
| 109 | /* | 109 | /* |
| 110 | * Most 386 processors have a bug where a POPAD can lock the | ||
| 111 | * machine even from user space. | ||
| 112 | */ | ||
| 113 | |||
| 114 | static void __init check_popad(void) | ||
| 115 | { | ||
| 116 | #ifndef CONFIG_X86_POPAD_OK | ||
| 117 | int res, inp = (int) &res; | ||
| 118 | |||
| 119 | pr_info("Checking for popad bug... "); | ||
| 120 | __asm__ __volatile__( | ||
| 121 | "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " | ||
| 122 | : "=&a" (res) | ||
| 123 | : "d" (inp) | ||
| 124 | : "ecx", "edi"); | ||
| 125 | /* | ||
| 126 | * If this fails, it means that any user program may lock the | ||
| 127 | * CPU hard. Too bad. | ||
| 128 | */ | ||
| 129 | if (res != 12345678) | ||
| 130 | pr_cont("Buggy\n"); | ||
| 131 | else | ||
| 132 | pr_cont("OK\n"); | ||
| 133 | #endif | ||
| 134 | } | ||
| 135 | |||
| 136 | /* | ||
| 137 | * Check whether we are able to run this kernel safely on SMP. | 110 | * Check whether we are able to run this kernel safely on SMP. |
| 138 | * | 111 | * |
| 139 | * - In order to run on a i386, we need to be compiled for i386 | 112 | * - i386 is no longer supported. |
| 140 | * (for due to lack of "invlpg" and working WP on a i386) | ||
| 141 | * - In order to run on anything without a TSC, we need to be | 113 | * - In order to run on anything without a TSC, we need to be |
| 142 | * compiled for a i486. | 114 | * compiled for a i486. |
| 143 | */ | 115 | */ |
| 144 | 116 | ||
| 145 | static void __init check_config(void) | 117 | static void __init check_config(void) |
| 146 | { | 118 | { |
| 147 | /* | 119 | if (boot_cpu_data.x86 < 4) |
| 148 | * We'd better not be a i386 if we're configured to use some | ||
| 149 | * i486+ only features! (WP works in supervisor mode and the | ||
| 150 | * new "invlpg" and "bswap" instructions) | ||
| 151 | */ | ||
| 152 | #if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \ | ||
| 153 | defined(CONFIG_X86_BSWAP) | ||
| 154 | if (boot_cpu_data.x86 == 3) | ||
| 155 | panic("Kernel requires i486+ for 'invlpg' and other features"); | 120 | panic("Kernel requires i486+ for 'invlpg' and other features"); |
| 156 | #endif | ||
| 157 | } | 121 | } |
| 158 | 122 | ||
| 159 | 123 | ||
| @@ -166,7 +130,6 @@ void __init check_bugs(void) | |||
| 166 | #endif | 130 | #endif |
| 167 | check_config(); | 131 | check_config(); |
| 168 | check_hlt(); | 132 | check_hlt(); |
| 169 | check_popad(); | ||
| 170 | init_utsname()->machine[1] = | 133 | init_utsname()->machine[1] = |
| 171 | '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); | 134 | '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); |
| 172 | alternative_instructions(); | 135 | alternative_instructions(); |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7505f7b13e71..9c3ab43a6954 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
| @@ -1173,15 +1173,6 @@ DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); | |||
| 1173 | DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); | 1173 | DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); |
| 1174 | #endif | 1174 | #endif |
| 1175 | 1175 | ||
| 1176 | /* Make sure %fs and %gs are initialized properly in idle threads */ | ||
| 1177 | struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) | ||
| 1178 | { | ||
| 1179 | memset(regs, 0, sizeof(struct pt_regs)); | ||
| 1180 | regs->fs = __KERNEL_PERCPU; | ||
| 1181 | regs->gs = __KERNEL_STACK_CANARY; | ||
| 1182 | |||
| 1183 | return regs; | ||
| 1184 | } | ||
| 1185 | #endif /* CONFIG_X86_64 */ | 1176 | #endif /* CONFIG_X86_64 */ |
| 1186 | 1177 | ||
| 1187 | /* | 1178 | /* |
| @@ -1237,7 +1228,7 @@ void __cpuinit cpu_init(void) | |||
| 1237 | oist = &per_cpu(orig_ist, cpu); | 1228 | oist = &per_cpu(orig_ist, cpu); |
| 1238 | 1229 | ||
| 1239 | #ifdef CONFIG_NUMA | 1230 | #ifdef CONFIG_NUMA |
| 1240 | if (cpu != 0 && this_cpu_read(numa_node) == 0 && | 1231 | if (this_cpu_read(numa_node) == 0 && |
| 1241 | early_cpu_to_node(cpu) != NUMA_NO_NODE) | 1232 | early_cpu_to_node(cpu) != NUMA_NO_NODE) |
| 1242 | set_numa_node(early_cpu_to_node(cpu)); | 1233 | set_numa_node(early_cpu_to_node(cpu)); |
| 1243 | #endif | 1234 | #endif |
| @@ -1269,8 +1260,7 @@ void __cpuinit cpu_init(void) | |||
| 1269 | barrier(); | 1260 | barrier(); |
| 1270 | 1261 | ||
| 1271 | x86_configure_nx(); | 1262 | x86_configure_nx(); |
| 1272 | if (cpu != 0) | 1263 | enable_x2apic(); |
| 1273 | enable_x2apic(); | ||
| 1274 | 1264 | ||
| 1275 | /* | 1265 | /* |
| 1276 | * set up and load the per-CPU TSS | 1266 | * set up and load the per-CPU TSS |
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 198e019a531a..fcaabd0432c5 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
| @@ -612,10 +612,6 @@ static void __cpuinit intel_tlb_lookup(const unsigned char desc) | |||
| 612 | 612 | ||
| 613 | static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) | 613 | static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) |
| 614 | { | 614 | { |
| 615 | if (!cpu_has_invlpg) { | ||
| 616 | tlb_flushall_shift = -1; | ||
| 617 | return; | ||
| 618 | } | ||
| 619 | switch ((c->x86 << 8) + c->x86_model) { | 615 | switch ((c->x86 << 8) + c->x86_model) { |
| 620 | case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ | 616 | case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ |
| 621 | case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ | 617 | case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ |
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 93c5451bdd52..fe9edec6698a 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
| @@ -538,7 +538,11 @@ __cpuinit cpuid4_cache_lookup_regs(int index, | |||
| 538 | unsigned edx; | 538 | unsigned edx; |
| 539 | 539 | ||
| 540 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { | 540 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { |
| 541 | amd_cpuid4(index, &eax, &ebx, &ecx); | 541 | if (cpu_has_topoext) |
| 542 | cpuid_count(0x8000001d, index, &eax.full, | ||
| 543 | &ebx.full, &ecx.full, &edx); | ||
| 544 | else | ||
| 545 | amd_cpuid4(index, &eax, &ebx, &ecx); | ||
| 542 | amd_init_l3_cache(this_leaf, index); | 546 | amd_init_l3_cache(this_leaf, index); |
| 543 | } else { | 547 | } else { |
| 544 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); | 548 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); |
| @@ -557,21 +561,39 @@ __cpuinit cpuid4_cache_lookup_regs(int index, | |||
| 557 | return 0; | 561 | return 0; |
| 558 | } | 562 | } |
| 559 | 563 | ||
| 560 | static int __cpuinit find_num_cache_leaves(void) | 564 | static int __cpuinit find_num_cache_leaves(struct cpuinfo_x86 *c) |
| 561 | { | 565 | { |
| 562 | unsigned int eax, ebx, ecx, edx; | 566 | unsigned int eax, ebx, ecx, edx, op; |
| 563 | union _cpuid4_leaf_eax cache_eax; | 567 | union _cpuid4_leaf_eax cache_eax; |
| 564 | int i = -1; | 568 | int i = -1; |
| 565 | 569 | ||
| 570 | if (c->x86_vendor == X86_VENDOR_AMD) | ||
| 571 | op = 0x8000001d; | ||
| 572 | else | ||
| 573 | op = 4; | ||
| 574 | |||
| 566 | do { | 575 | do { |
| 567 | ++i; | 576 | ++i; |
| 568 | /* Do cpuid(4) loop to find out num_cache_leaves */ | 577 | /* Do cpuid(op) loop to find out num_cache_leaves */ |
| 569 | cpuid_count(4, i, &eax, &ebx, &ecx, &edx); | 578 | cpuid_count(op, i, &eax, &ebx, &ecx, &edx); |
| 570 | cache_eax.full = eax; | 579 | cache_eax.full = eax; |
| 571 | } while (cache_eax.split.type != CACHE_TYPE_NULL); | 580 | } while (cache_eax.split.type != CACHE_TYPE_NULL); |
| 572 | return i; | 581 | return i; |
| 573 | } | 582 | } |
| 574 | 583 | ||
| 584 | void __cpuinit init_amd_cacheinfo(struct cpuinfo_x86 *c) | ||
| 585 | { | ||
| 586 | |||
| 587 | if (cpu_has_topoext) { | ||
| 588 | num_cache_leaves = find_num_cache_leaves(c); | ||
| 589 | } else if (c->extended_cpuid_level >= 0x80000006) { | ||
| 590 | if (cpuid_edx(0x80000006) & 0xf000) | ||
| 591 | num_cache_leaves = 4; | ||
| 592 | else | ||
| 593 | num_cache_leaves = 3; | ||
| 594 | } | ||
| 595 | } | ||
| 596 | |||
| 575 | unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) | 597 | unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) |
| 576 | { | 598 | { |
| 577 | /* Cache sizes */ | 599 | /* Cache sizes */ |
| @@ -588,7 +610,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) | |||
| 588 | 610 | ||
| 589 | if (is_initialized == 0) { | 611 | if (is_initialized == 0) { |
| 590 | /* Init num_cache_leaves from boot CPU */ | 612 | /* Init num_cache_leaves from boot CPU */ |
| 591 | num_cache_leaves = find_num_cache_leaves(); | 613 | num_cache_leaves = find_num_cache_leaves(c); |
| 592 | is_initialized++; | 614 | is_initialized++; |
| 593 | } | 615 | } |
| 594 | 616 | ||
| @@ -728,37 +750,50 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); | |||
| 728 | static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) | 750 | static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) |
| 729 | { | 751 | { |
| 730 | struct _cpuid4_info *this_leaf; | 752 | struct _cpuid4_info *this_leaf; |
| 731 | int ret, i, sibling; | 753 | int i, sibling; |
| 732 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
| 733 | 754 | ||
| 734 | ret = 0; | 755 | if (cpu_has_topoext) { |
| 735 | if (index == 3) { | 756 | unsigned int apicid, nshared, first, last; |
| 736 | ret = 1; | 757 | |
| 737 | for_each_cpu(i, cpu_llc_shared_mask(cpu)) { | 758 | if (!per_cpu(ici_cpuid4_info, cpu)) |
| 759 | return 0; | ||
| 760 | |||
| 761 | this_leaf = CPUID4_INFO_IDX(cpu, index); | ||
| 762 | nshared = this_leaf->base.eax.split.num_threads_sharing + 1; | ||
| 763 | apicid = cpu_data(cpu).apicid; | ||
| 764 | first = apicid - (apicid % nshared); | ||
| 765 | last = first + nshared - 1; | ||
| 766 | |||
| 767 | for_each_online_cpu(i) { | ||
| 768 | apicid = cpu_data(i).apicid; | ||
| 769 | if ((apicid < first) || (apicid > last)) | ||
| 770 | continue; | ||
| 738 | if (!per_cpu(ici_cpuid4_info, i)) | 771 | if (!per_cpu(ici_cpuid4_info, i)) |
| 739 | continue; | 772 | continue; |
| 740 | this_leaf = CPUID4_INFO_IDX(i, index); | 773 | this_leaf = CPUID4_INFO_IDX(i, index); |
| 741 | for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { | 774 | |
| 742 | if (!cpu_online(sibling)) | 775 | for_each_online_cpu(sibling) { |
| 776 | apicid = cpu_data(sibling).apicid; | ||
| 777 | if ((apicid < first) || (apicid > last)) | ||
| 743 | continue; | 778 | continue; |
| 744 | set_bit(sibling, this_leaf->shared_cpu_map); | 779 | set_bit(sibling, this_leaf->shared_cpu_map); |
| 745 | } | 780 | } |
| 746 | } | 781 | } |
| 747 | } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) { | 782 | } else if (index == 3) { |
| 748 | ret = 1; | 783 | for_each_cpu(i, cpu_llc_shared_mask(cpu)) { |
| 749 | for_each_cpu(i, cpu_sibling_mask(cpu)) { | ||
| 750 | if (!per_cpu(ici_cpuid4_info, i)) | 784 | if (!per_cpu(ici_cpuid4_info, i)) |
| 751 | continue; | 785 | continue; |
| 752 | this_leaf = CPUID4_INFO_IDX(i, index); | 786 | this_leaf = CPUID4_INFO_IDX(i, index); |
| 753 | for_each_cpu(sibling, cpu_sibling_mask(cpu)) { | 787 | for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { |
| 754 | if (!cpu_online(sibling)) | 788 | if (!cpu_online(sibling)) |
| 755 | continue; | 789 | continue; |
| 756 | set_bit(sibling, this_leaf->shared_cpu_map); | 790 | set_bit(sibling, this_leaf->shared_cpu_map); |
| 757 | } | 791 | } |
| 758 | } | 792 | } |
| 759 | } | 793 | } else |
| 794 | return 0; | ||
| 760 | 795 | ||
| 761 | return ret; | 796 | return 1; |
| 762 | } | 797 | } |
| 763 | 798 | ||
| 764 | static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) | 799 | static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) |
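With topology extensions, cache_shared_amd_cpu_map_setup() above no longer walks cpu_llc_shared_mask()/cpu_sibling_mask(); it derives the sharing set from APIC IDs: CPUID leaf 0x8000001d reports num_threads_sharing, and the sharing group is the block of nshared consecutive APIC IDs starting at a multiple of nshared. A small sketch of that arithmetic follows; cache_sharing_range() is an illustrative helper, not in the commit.

/*
 * Example: apicid = 13, nshared = 4  ->  first = 12, last = 15,
 * so APIC IDs 12..15 share this cache instance.
 */
static void cache_sharing_range(unsigned int apicid, unsigned int nshared,
				unsigned int *first, unsigned int *last)
{
	*first = apicid - (apicid % nshared);
	*last  = *first + nshared - 1;
}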
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 6a05c1d327a9..5b7d4fa5d3b7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
| @@ -24,8 +24,6 @@ struct mce_bank { | |||
| 24 | int mce_severity(struct mce *a, int tolerant, char **msg); | 24 | int mce_severity(struct mce *a, int tolerant, char **msg); |
| 25 | struct dentry *mce_get_debugfs_dir(void); | 25 | struct dentry *mce_get_debugfs_dir(void); |
| 26 | 26 | ||
| 27 | extern int mce_ser; | ||
| 28 | |||
| 29 | extern struct mce_bank *mce_banks; | 27 | extern struct mce_bank *mce_banks; |
| 30 | 28 | ||
| 31 | #ifdef CONFIG_X86_MCE_INTEL | 29 | #ifdef CONFIG_X86_MCE_INTEL |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 13017626f9a8..beb1f1689e52 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
| @@ -193,9 +193,9 @@ int mce_severity(struct mce *m, int tolerant, char **msg) | |||
| 193 | continue; | 193 | continue; |
| 194 | if ((m->mcgstatus & s->mcgmask) != s->mcgres) | 194 | if ((m->mcgstatus & s->mcgmask) != s->mcgres) |
| 195 | continue; | 195 | continue; |
| 196 | if (s->ser == SER_REQUIRED && !mce_ser) | 196 | if (s->ser == SER_REQUIRED && !mca_cfg.ser) |
| 197 | continue; | 197 | continue; |
| 198 | if (s->ser == NO_SER && mce_ser) | 198 | if (s->ser == NO_SER && mca_cfg.ser) |
| 199 | continue; | 199 | continue; |
| 200 | if (s->context && ctx != s->context) | 200 | if (s->context && ctx != s->context) |
| 201 | continue; | 201 | continue; |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 46cbf8689692..80dbda84f1c3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
| @@ -58,34 +58,26 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); | |||
| 58 | #define CREATE_TRACE_POINTS | 58 | #define CREATE_TRACE_POINTS |
| 59 | #include <trace/events/mce.h> | 59 | #include <trace/events/mce.h> |
| 60 | 60 | ||
| 61 | int mce_disabled __read_mostly; | ||
| 62 | |||
| 63 | #define SPINUNIT 100 /* 100ns */ | 61 | #define SPINUNIT 100 /* 100ns */ |
| 64 | 62 | ||
| 65 | atomic_t mce_entry; | 63 | atomic_t mce_entry; |
| 66 | 64 | ||
| 67 | DEFINE_PER_CPU(unsigned, mce_exception_count); | 65 | DEFINE_PER_CPU(unsigned, mce_exception_count); |
| 68 | 66 | ||
| 69 | /* | 67 | struct mce_bank *mce_banks __read_mostly; |
| 70 | * Tolerant levels: | 68 | |
| 71 | * 0: always panic on uncorrected errors, log corrected errors | 69 | struct mca_config mca_cfg __read_mostly = { |
| 72 | * 1: panic or SIGBUS on uncorrected errors, log corrected errors | 70 | .bootlog = -1, |
| 73 | * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors | 71 | /* |
| 74 | * 3: never panic or SIGBUS, log all errors (for testing only) | 72 | * Tolerant levels: |
| 75 | */ | 73 | * 0: always panic on uncorrected errors, log corrected errors |
| 76 | static int tolerant __read_mostly = 1; | 74 | * 1: panic or SIGBUS on uncorrected errors, log corrected errors |
| 77 | static int banks __read_mostly; | 75 | * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors |
| 78 | static int rip_msr __read_mostly; | 76 | * 3: never panic or SIGBUS, log all errors (for testing only) |
| 79 | static int mce_bootlog __read_mostly = -1; | 77 | */ |
| 80 | static int monarch_timeout __read_mostly = -1; | 78 | .tolerant = 1, |
| 81 | static int mce_panic_timeout __read_mostly; | 79 | .monarch_timeout = -1 |
| 82 | static int mce_dont_log_ce __read_mostly; | 80 | }; |
| 83 | int mce_cmci_disabled __read_mostly; | ||
| 84 | int mce_ignore_ce __read_mostly; | ||
| 85 | int mce_ser __read_mostly; | ||
| 86 | int mce_bios_cmci_threshold __read_mostly; | ||
| 87 | |||
| 88 | struct mce_bank *mce_banks __read_mostly; | ||
| 89 | 81 | ||
| 90 | /* User mode helper program triggered by machine check event */ | 82 | /* User mode helper program triggered by machine check event */ |
| 91 | static unsigned long mce_need_notify; | 83 | static unsigned long mce_need_notify; |
| @@ -302,7 +294,7 @@ static void wait_for_panic(void) | |||
| 302 | while (timeout-- > 0) | 294 | while (timeout-- > 0) |
| 303 | udelay(1); | 295 | udelay(1); |
| 304 | if (panic_timeout == 0) | 296 | if (panic_timeout == 0) |
| 305 | panic_timeout = mce_panic_timeout; | 297 | panic_timeout = mca_cfg.panic_timeout; |
| 306 | panic("Panicing machine check CPU died"); | 298 | panic("Panicing machine check CPU died"); |
| 307 | } | 299 | } |
| 308 | 300 | ||
| @@ -360,7 +352,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) | |||
| 360 | pr_emerg(HW_ERR "Machine check: %s\n", exp); | 352 | pr_emerg(HW_ERR "Machine check: %s\n", exp); |
| 361 | if (!fake_panic) { | 353 | if (!fake_panic) { |
| 362 | if (panic_timeout == 0) | 354 | if (panic_timeout == 0) |
| 363 | panic_timeout = mce_panic_timeout; | 355 | panic_timeout = mca_cfg.panic_timeout; |
| 364 | panic(msg); | 356 | panic(msg); |
| 365 | } else | 357 | } else |
| 366 | pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); | 358 | pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); |
| @@ -372,7 +364,7 @@ static int msr_to_offset(u32 msr) | |||
| 372 | { | 364 | { |
| 373 | unsigned bank = __this_cpu_read(injectm.bank); | 365 | unsigned bank = __this_cpu_read(injectm.bank); |
| 374 | 366 | ||
| 375 | if (msr == rip_msr) | 367 | if (msr == mca_cfg.rip_msr) |
| 376 | return offsetof(struct mce, ip); | 368 | return offsetof(struct mce, ip); |
| 377 | if (msr == MSR_IA32_MCx_STATUS(bank)) | 369 | if (msr == MSR_IA32_MCx_STATUS(bank)) |
| 378 | return offsetof(struct mce, status); | 370 | return offsetof(struct mce, status); |
| @@ -451,8 +443,8 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) | |||
| 451 | m->cs |= 3; | 443 | m->cs |= 3; |
| 452 | } | 444 | } |
| 453 | /* Use accurate RIP reporting if available. */ | 445 | /* Use accurate RIP reporting if available. */ |
| 454 | if (rip_msr) | 446 | if (mca_cfg.rip_msr) |
| 455 | m->ip = mce_rdmsrl(rip_msr); | 447 | m->ip = mce_rdmsrl(mca_cfg.rip_msr); |
| 456 | } | 448 | } |
| 457 | } | 449 | } |
| 458 | 450 | ||
| @@ -513,7 +505,7 @@ static int mce_ring_add(unsigned long pfn) | |||
| 513 | 505 | ||
| 514 | int mce_available(struct cpuinfo_x86 *c) | 506 | int mce_available(struct cpuinfo_x86 *c) |
| 515 | { | 507 | { |
| 516 | if (mce_disabled) | 508 | if (mca_cfg.disabled) |
| 517 | return 0; | 509 | return 0; |
| 518 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); | 510 | return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); |
| 519 | } | 511 | } |
| @@ -565,7 +557,7 @@ static void mce_read_aux(struct mce *m, int i) | |||
| 565 | /* | 557 | /* |
| 566 | * Mask the reported address by the reported granularity. | 558 | * Mask the reported address by the reported granularity. |
| 567 | */ | 559 | */ |
| 568 | if (mce_ser && (m->status & MCI_STATUS_MISCV)) { | 560 | if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) { |
| 569 | u8 shift = MCI_MISC_ADDR_LSB(m->misc); | 561 | u8 shift = MCI_MISC_ADDR_LSB(m->misc); |
| 570 | m->addr >>= shift; | 562 | m->addr >>= shift; |
| 571 | m->addr <<= shift; | 563 | m->addr <<= shift; |
| @@ -599,7 +591,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
| 599 | 591 | ||
| 600 | mce_gather_info(&m, NULL); | 592 | mce_gather_info(&m, NULL); |
| 601 | 593 | ||
| 602 | for (i = 0; i < banks; i++) { | 594 | for (i = 0; i < mca_cfg.banks; i++) { |
| 603 | if (!mce_banks[i].ctl || !test_bit(i, *b)) | 595 | if (!mce_banks[i].ctl || !test_bit(i, *b)) |
| 604 | continue; | 596 | continue; |
| 605 | 597 | ||
| @@ -620,7 +612,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
| 620 | * TBD do the same check for MCI_STATUS_EN here? | 612 | * TBD do the same check for MCI_STATUS_EN here? |
| 621 | */ | 613 | */ |
| 622 | if (!(flags & MCP_UC) && | 614 | if (!(flags & MCP_UC) && |
| 623 | (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) | 615 | (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) |
| 624 | continue; | 616 | continue; |
| 625 | 617 | ||
| 626 | mce_read_aux(&m, i); | 618 | mce_read_aux(&m, i); |
| @@ -631,7 +623,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
| 631 | * Don't get the IP here because it's unlikely to | 623 | * Don't get the IP here because it's unlikely to |
| 632 | * have anything to do with the actual error location. | 624 | * have anything to do with the actual error location. |
| 633 | */ | 625 | */ |
| 634 | if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) | 626 | if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) |
| 635 | mce_log(&m); | 627 | mce_log(&m); |
| 636 | 628 | ||
| 637 | /* | 629 | /* |
| @@ -658,14 +650,14 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, | |||
| 658 | { | 650 | { |
| 659 | int i, ret = 0; | 651 | int i, ret = 0; |
| 660 | 652 | ||
| 661 | for (i = 0; i < banks; i++) { | 653 | for (i = 0; i < mca_cfg.banks; i++) { |
| 662 | m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); | 654 | m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); |
| 663 | if (m->status & MCI_STATUS_VAL) { | 655 | if (m->status & MCI_STATUS_VAL) { |
| 664 | __set_bit(i, validp); | 656 | __set_bit(i, validp); |
| 665 | if (quirk_no_way_out) | 657 | if (quirk_no_way_out) |
| 666 | quirk_no_way_out(i, m, regs); | 658 | quirk_no_way_out(i, m, regs); |
| 667 | } | 659 | } |
| 668 | if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) | 660 | if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) |
| 669 | ret = 1; | 661 | ret = 1; |
| 670 | } | 662 | } |
| 671 | return ret; | 663 | return ret; |
| @@ -696,11 +688,11 @@ static int mce_timed_out(u64 *t) | |||
| 696 | rmb(); | 688 | rmb(); |
| 697 | if (atomic_read(&mce_paniced)) | 689 | if (atomic_read(&mce_paniced)) |
| 698 | wait_for_panic(); | 690 | wait_for_panic(); |
| 699 | if (!monarch_timeout) | 691 | if (!mca_cfg.monarch_timeout) |
| 700 | goto out; | 692 | goto out; |
| 701 | if ((s64)*t < SPINUNIT) { | 693 | if ((s64)*t < SPINUNIT) { |
| 702 | /* CHECKME: Make panic default for 1 too? */ | 694 | /* CHECKME: Make panic default for 1 too? */ |
| 703 | if (tolerant < 1) | 695 | if (mca_cfg.tolerant < 1) |
| 704 | mce_panic("Timeout synchronizing machine check over CPUs", | 696 | mce_panic("Timeout synchronizing machine check over CPUs", |
| 705 | NULL, NULL); | 697 | NULL, NULL); |
| 706 | cpu_missing = 1; | 698 | cpu_missing = 1; |
| @@ -750,7 +742,8 @@ static void mce_reign(void) | |||
| 750 | * Grade the severity of the errors of all the CPUs. | 742 | * Grade the severity of the errors of all the CPUs. |
| 751 | */ | 743 | */ |
| 752 | for_each_possible_cpu(cpu) { | 744 | for_each_possible_cpu(cpu) { |
| 753 | int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, | 745 | int severity = mce_severity(&per_cpu(mces_seen, cpu), |
| 746 | mca_cfg.tolerant, | ||
| 754 | &nmsg); | 747 | &nmsg); |
| 755 | if (severity > global_worst) { | 748 | if (severity > global_worst) { |
| 756 | msg = nmsg; | 749 | msg = nmsg; |
| @@ -764,7 +757,7 @@ static void mce_reign(void) | |||
| 764 | * This dumps all the mces in the log buffer and stops the | 757 | * This dumps all the mces in the log buffer and stops the |
| 765 | * other CPUs. | 758 | * other CPUs. |
| 766 | */ | 759 | */ |
| 767 | if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) | 760 | if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) |
| 768 | mce_panic("Fatal Machine check", m, msg); | 761 | mce_panic("Fatal Machine check", m, msg); |
| 769 | 762 | ||
| 770 | /* | 763 | /* |
| @@ -777,7 +770,7 @@ static void mce_reign(void) | |||
| 777 | * No machine check event found. Must be some external | 770 | * No machine check event found. Must be some external |
| 778 | * source or one CPU is hung. Panic. | 771 | * source or one CPU is hung. Panic. |
| 779 | */ | 772 | */ |
| 780 | if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) | 773 | if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) |
| 781 | mce_panic("Machine check from unknown source", NULL, NULL); | 774 | mce_panic("Machine check from unknown source", NULL, NULL); |
| 782 | 775 | ||
| 783 | /* | 776 | /* |
| @@ -801,7 +794,7 @@ static int mce_start(int *no_way_out) | |||
| 801 | { | 794 | { |
| 802 | int order; | 795 | int order; |
| 803 | int cpus = num_online_cpus(); | 796 | int cpus = num_online_cpus(); |
| 804 | u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; | 797 | u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; |
| 805 | 798 | ||
| 806 | if (!timeout) | 799 | if (!timeout) |
| 807 | return -1; | 800 | return -1; |
| @@ -865,7 +858,7 @@ static int mce_start(int *no_way_out) | |||
| 865 | static int mce_end(int order) | 858 | static int mce_end(int order) |
| 866 | { | 859 | { |
| 867 | int ret = -1; | 860 | int ret = -1; |
| 868 | u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; | 861 | u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; |
| 869 | 862 | ||
| 870 | if (!timeout) | 863 | if (!timeout) |
| 871 | goto reset; | 864 | goto reset; |
| @@ -946,7 +939,7 @@ static void mce_clear_state(unsigned long *toclear) | |||
| 946 | { | 939 | { |
| 947 | int i; | 940 | int i; |
| 948 | 941 | ||
| 949 | for (i = 0; i < banks; i++) { | 942 | for (i = 0; i < mca_cfg.banks; i++) { |
| 950 | if (test_bit(i, toclear)) | 943 | if (test_bit(i, toclear)) |
| 951 | mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); | 944 | mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); |
| 952 | } | 945 | } |
| @@ -1011,6 +1004,7 @@ static void mce_clear_info(struct mce_info *mi) | |||
| 1011 | */ | 1004 | */ |
| 1012 | void do_machine_check(struct pt_regs *regs, long error_code) | 1005 | void do_machine_check(struct pt_regs *regs, long error_code) |
| 1013 | { | 1006 | { |
| 1007 | struct mca_config *cfg = &mca_cfg; | ||
| 1014 | struct mce m, *final; | 1008 | struct mce m, *final; |
| 1015 | int i; | 1009 | int i; |
| 1016 | int worst = 0; | 1010 | int worst = 0; |
| @@ -1022,7 +1016,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1022 | int order; | 1016 | int order; |
| 1023 | /* | 1017 | /* |
| 1024 | * If no_way_out gets set, there is no safe way to recover from this | 1018 | * If no_way_out gets set, there is no safe way to recover from this |
| 1025 | * MCE. If tolerant is cranked up, we'll try anyway. | 1019 | * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. |
| 1026 | */ | 1020 | */ |
| 1027 | int no_way_out = 0; | 1021 | int no_way_out = 0; |
| 1028 | /* | 1022 | /* |
| @@ -1038,7 +1032,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1038 | 1032 | ||
| 1039 | this_cpu_inc(mce_exception_count); | 1033 | this_cpu_inc(mce_exception_count); |
| 1040 | 1034 | ||
| 1041 | if (!banks) | 1035 | if (!cfg->banks) |
| 1042 | goto out; | 1036 | goto out; |
| 1043 | 1037 | ||
| 1044 | mce_gather_info(&m, regs); | 1038 | mce_gather_info(&m, regs); |
| @@ -1065,7 +1059,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1065 | * because the first one to see it will clear it. | 1059 | * because the first one to see it will clear it. |
| 1066 | */ | 1060 | */ |
| 1067 | order = mce_start(&no_way_out); | 1061 | order = mce_start(&no_way_out); |
| 1068 | for (i = 0; i < banks; i++) { | 1062 | for (i = 0; i < cfg->banks; i++) { |
| 1069 | __clear_bit(i, toclear); | 1063 | __clear_bit(i, toclear); |
| 1070 | if (!test_bit(i, valid_banks)) | 1064 | if (!test_bit(i, valid_banks)) |
| 1071 | continue; | 1065 | continue; |
| @@ -1084,7 +1078,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1084 | * Non uncorrected or non signaled errors are handled by | 1078 | * Non uncorrected or non signaled errors are handled by |
| 1085 | * machine_check_poll. Leave them alone, unless this panics. | 1079 | * machine_check_poll. Leave them alone, unless this panics. |
| 1086 | */ | 1080 | */ |
| 1087 | if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && | 1081 | if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && |
| 1088 | !no_way_out) | 1082 | !no_way_out) |
| 1089 | continue; | 1083 | continue; |
| 1090 | 1084 | ||
| @@ -1093,7 +1087,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1093 | */ | 1087 | */ |
| 1094 | add_taint(TAINT_MACHINE_CHECK); | 1088 | add_taint(TAINT_MACHINE_CHECK); |
| 1095 | 1089 | ||
| 1096 | severity = mce_severity(&m, tolerant, NULL); | 1090 | severity = mce_severity(&m, cfg->tolerant, NULL); |
| 1097 | 1091 | ||
| 1098 | /* | 1092 | /* |
| 1099 | * When machine check was for corrected handler don't touch, | 1093 | * When machine check was for corrected handler don't touch, |
| @@ -1117,7 +1111,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1117 | * When the ring overflows we just ignore the AO error. | 1111 | * When the ring overflows we just ignore the AO error. |
| 1118 | * RED-PEN add some logging mechanism when | 1112 | * RED-PEN add some logging mechanism when |
| 1119 | * usable_address or mce_add_ring fails. | 1113 | * usable_address or mce_add_ring fails. |
| 1120 | * RED-PEN don't ignore overflow for tolerant == 0 | 1114 | * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 |
| 1121 | */ | 1115 | */ |
| 1122 | if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) | 1116 | if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) |
| 1123 | mce_ring_add(m.addr >> PAGE_SHIFT); | 1117 | mce_ring_add(m.addr >> PAGE_SHIFT); |
| @@ -1149,7 +1143,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1149 | * issues we try to recover, or limit damage to the current | 1143 | * issues we try to recover, or limit damage to the current |
| 1150 | * process. | 1144 | * process. |
| 1151 | */ | 1145 | */ |
| 1152 | if (tolerant < 3) { | 1146 | if (cfg->tolerant < 3) { |
| 1153 | if (no_way_out) | 1147 | if (no_way_out) |
| 1154 | mce_panic("Fatal machine check on current CPU", &m, msg); | 1148 | mce_panic("Fatal machine check on current CPU", &m, msg); |
| 1155 | if (worst == MCE_AR_SEVERITY) { | 1149 | if (worst == MCE_AR_SEVERITY) { |
| @@ -1377,11 +1371,13 @@ EXPORT_SYMBOL_GPL(mce_notify_irq); | |||
| 1377 | static int __cpuinit __mcheck_cpu_mce_banks_init(void) | 1371 | static int __cpuinit __mcheck_cpu_mce_banks_init(void) |
| 1378 | { | 1372 | { |
| 1379 | int i; | 1373 | int i; |
| 1374 | u8 num_banks = mca_cfg.banks; | ||
| 1380 | 1375 | ||
| 1381 | mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); | 1376 | mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL); |
| 1382 | if (!mce_banks) | 1377 | if (!mce_banks) |
| 1383 | return -ENOMEM; | 1378 | return -ENOMEM; |
| 1384 | for (i = 0; i < banks; i++) { | 1379 | |
| 1380 | for (i = 0; i < num_banks; i++) { | ||
| 1385 | struct mce_bank *b = &mce_banks[i]; | 1381 | struct mce_bank *b = &mce_banks[i]; |
| 1386 | 1382 | ||
| 1387 | b->ctl = -1ULL; | 1383 | b->ctl = -1ULL; |
| @@ -1401,7 +1397,7 @@ static int __cpuinit __mcheck_cpu_cap_init(void) | |||
| 1401 | rdmsrl(MSR_IA32_MCG_CAP, cap); | 1397 | rdmsrl(MSR_IA32_MCG_CAP, cap); |
| 1402 | 1398 | ||
| 1403 | b = cap & MCG_BANKCNT_MASK; | 1399 | b = cap & MCG_BANKCNT_MASK; |
| 1404 | if (!banks) | 1400 | if (!mca_cfg.banks) |
| 1405 | pr_info("CPU supports %d MCE banks\n", b); | 1401 | pr_info("CPU supports %d MCE banks\n", b); |
| 1406 | 1402 | ||
| 1407 | if (b > MAX_NR_BANKS) { | 1403 | if (b > MAX_NR_BANKS) { |
| @@ -1411,8 +1407,9 @@ static int __cpuinit __mcheck_cpu_cap_init(void) | |||
| 1411 | } | 1407 | } |
| 1412 | 1408 | ||
| 1413 | /* Don't support asymmetric configurations today */ | 1409 | /* Don't support asymmetric configurations today */ |
| 1414 | WARN_ON(banks != 0 && b != banks); | 1410 | WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); |
| 1415 | banks = b; | 1411 | mca_cfg.banks = b; |
| 1412 | |||
| 1416 | if (!mce_banks) { | 1413 | if (!mce_banks) { |
| 1417 | int err = __mcheck_cpu_mce_banks_init(); | 1414 | int err = __mcheck_cpu_mce_banks_init(); |
| 1418 | 1415 | ||
| @@ -1422,25 +1419,29 @@ static int __cpuinit __mcheck_cpu_cap_init(void) | |||
| 1422 | 1419 | ||
| 1423 | /* Use accurate RIP reporting if available. */ | 1420 | /* Use accurate RIP reporting if available. */ |
| 1424 | if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) | 1421 | if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) |
| 1425 | rip_msr = MSR_IA32_MCG_EIP; | 1422 | mca_cfg.rip_msr = MSR_IA32_MCG_EIP; |
| 1426 | 1423 | ||
| 1427 | if (cap & MCG_SER_P) | 1424 | if (cap & MCG_SER_P) |
| 1428 | mce_ser = 1; | 1425 | mca_cfg.ser = true; |
| 1429 | 1426 | ||
| 1430 | return 0; | 1427 | return 0; |
| 1431 | } | 1428 | } |
| 1432 | 1429 | ||
| 1433 | static void __mcheck_cpu_init_generic(void) | 1430 | static void __mcheck_cpu_init_generic(void) |
| 1434 | { | 1431 | { |
| 1432 | enum mcp_flags m_fl = 0; | ||
| 1435 | mce_banks_t all_banks; | 1433 | mce_banks_t all_banks; |
| 1436 | u64 cap; | 1434 | u64 cap; |
| 1437 | int i; | 1435 | int i; |
| 1438 | 1436 | ||
| 1437 | if (!mca_cfg.bootlog) | ||
| 1438 | m_fl = MCP_DONTLOG; | ||
| 1439 | |||
| 1439 | /* | 1440 | /* |
| 1440 | * Log the machine checks left over from the previous reset. | 1441 | * Log the machine checks left over from the previous reset. |
| 1441 | */ | 1442 | */ |
| 1442 | bitmap_fill(all_banks, MAX_NR_BANKS); | 1443 | bitmap_fill(all_banks, MAX_NR_BANKS); |
| 1443 | machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); | 1444 | machine_check_poll(MCP_UC | m_fl, &all_banks); |
| 1444 | 1445 | ||
| 1445 | set_in_cr4(X86_CR4_MCE); | 1446 | set_in_cr4(X86_CR4_MCE); |
| 1446 | 1447 | ||
| @@ -1448,7 +1449,7 @@ static void __mcheck_cpu_init_generic(void) | |||
| 1448 | if (cap & MCG_CTL_P) | 1449 | if (cap & MCG_CTL_P) |
| 1449 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | 1450 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); |
| 1450 | 1451 | ||
| 1451 | for (i = 0; i < banks; i++) { | 1452 | for (i = 0; i < mca_cfg.banks; i++) { |
| 1452 | struct mce_bank *b = &mce_banks[i]; | 1453 | struct mce_bank *b = &mce_banks[i]; |
| 1453 | 1454 | ||
| 1454 | if (!b->init) | 1455 | if (!b->init) |
| @@ -1489,6 +1490,8 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) | |||
| 1489 | /* Add per CPU specific workarounds here */ | 1490 | /* Add per CPU specific workarounds here */ |
| 1490 | static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) | 1491 | static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) |
| 1491 | { | 1492 | { |
| 1493 | struct mca_config *cfg = &mca_cfg; | ||
| 1494 | |||
| 1492 | if (c->x86_vendor == X86_VENDOR_UNKNOWN) { | 1495 | if (c->x86_vendor == X86_VENDOR_UNKNOWN) { |
| 1493 | pr_info("unknown CPU type - not enabling MCE support\n"); | 1496 | pr_info("unknown CPU type - not enabling MCE support\n"); |
| 1494 | return -EOPNOTSUPP; | 1497 | return -EOPNOTSUPP; |
| @@ -1496,7 +1499,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) | |||
| 1496 | 1499 | ||
| 1497 | /* This should be disabled by the BIOS, but isn't always */ | 1500 | /* This should be disabled by the BIOS, but isn't always */ |
| 1498 | if (c->x86_vendor == X86_VENDOR_AMD) { | 1501 | if (c->x86_vendor == X86_VENDOR_AMD) { |
| 1499 | if (c->x86 == 15 && banks > 4) { | 1502 | if (c->x86 == 15 && cfg->banks > 4) { |
| 1500 | /* | 1503 | /* |
| 1501 | * disable GART TBL walk error reporting, which | 1504 | * disable GART TBL walk error reporting, which |
| 1502 | * trips off incorrectly with the IOMMU & 3ware | 1505 | * trips off incorrectly with the IOMMU & 3ware |
| @@ -1504,18 +1507,18 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) | |||
| 1504 | */ | 1507 | */ |
| 1505 | clear_bit(10, (unsigned long *)&mce_banks[4].ctl); | 1508 | clear_bit(10, (unsigned long *)&mce_banks[4].ctl); |
| 1506 | } | 1509 | } |
| 1507 | if (c->x86 <= 17 && mce_bootlog < 0) { | 1510 | if (c->x86 <= 17 && cfg->bootlog < 0) { |
| 1508 | /* | 1511 | /* |
| 1509 | * Lots of broken BIOS around that don't clear them | 1512 | * Lots of broken BIOS around that don't clear them |
| 1510 | * by default and leave crap in there. Don't log: | 1513 | * by default and leave crap in there. Don't log: |
| 1511 | */ | 1514 | */ |
| 1512 | mce_bootlog = 0; | 1515 | cfg->bootlog = 0; |
| 1513 | } | 1516 | } |
| 1514 | /* | 1517 | /* |
| 1515 | * Various K7s with broken bank 0 around. Always disable | 1518 | * Various K7s with broken bank 0 around. Always disable |
| 1516 | * by default. | 1519 | * by default. |
| 1517 | */ | 1520 | */ |
| 1518 | if (c->x86 == 6 && banks > 0) | 1521 | if (c->x86 == 6 && cfg->banks > 0) |
| 1519 | mce_banks[0].ctl = 0; | 1522 | mce_banks[0].ctl = 0; |
| 1520 | 1523 | ||
| 1521 | /* | 1524 | /* |
| @@ -1566,7 +1569,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) | |||
| 1566 | * valid event later, merely don't write CTL0. | 1569 | * valid event later, merely don't write CTL0. |
| 1567 | */ | 1570 | */ |
| 1568 | 1571 | ||
| 1569 | if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) | 1572 | if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) |
| 1570 | mce_banks[0].init = 0; | 1573 | mce_banks[0].init = 0; |
| 1571 | 1574 | ||
| 1572 | /* | 1575 | /* |
| @@ -1574,23 +1577,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) | |||
| 1574 | * synchronization with a one second timeout. | 1577 | * synchronization with a one second timeout. |
| 1575 | */ | 1578 | */ |
| 1576 | if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && | 1579 | if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && |
| 1577 | monarch_timeout < 0) | 1580 | cfg->monarch_timeout < 0) |
| 1578 | monarch_timeout = USEC_PER_SEC; | 1581 | cfg->monarch_timeout = USEC_PER_SEC; |
| 1579 | 1582 | ||
| 1580 | /* | 1583 | /* |
| 1581 | * There are also broken BIOSes on some Pentium M and | 1584 | * There are also broken BIOSes on some Pentium M and |
| 1582 | * earlier systems: | 1585 | * earlier systems: |
| 1583 | */ | 1586 | */ |
| 1584 | if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) | 1587 | if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) |
| 1585 | mce_bootlog = 0; | 1588 | cfg->bootlog = 0; |
| 1586 | 1589 | ||
| 1587 | if (c->x86 == 6 && c->x86_model == 45) | 1590 | if (c->x86 == 6 && c->x86_model == 45) |
| 1588 | quirk_no_way_out = quirk_sandybridge_ifu; | 1591 | quirk_no_way_out = quirk_sandybridge_ifu; |
| 1589 | } | 1592 | } |
| 1590 | if (monarch_timeout < 0) | 1593 | if (cfg->monarch_timeout < 0) |
| 1591 | monarch_timeout = 0; | 1594 | cfg->monarch_timeout = 0; |
| 1592 | if (mce_bootlog != 0) | 1595 | if (cfg->bootlog != 0) |
| 1593 | mce_panic_timeout = 30; | 1596 | cfg->panic_timeout = 30; |
| 1594 | 1597 | ||
| 1595 | return 0; | 1598 | return 0; |
| 1596 | } | 1599 | } |
| @@ -1635,7 +1638,7 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t) | |||
| 1635 | 1638 | ||
| 1636 | __this_cpu_write(mce_next_interval, iv); | 1639 | __this_cpu_write(mce_next_interval, iv); |
| 1637 | 1640 | ||
| 1638 | if (mce_ignore_ce || !iv) | 1641 | if (mca_cfg.ignore_ce || !iv) |
| 1639 | return; | 1642 | return; |
| 1640 | 1643 | ||
| 1641 | t->expires = round_jiffies(jiffies + iv); | 1644 | t->expires = round_jiffies(jiffies + iv); |
| @@ -1668,7 +1671,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = | |||
| 1668 | */ | 1671 | */ |
| 1669 | void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) | 1672 | void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) |
| 1670 | { | 1673 | { |
| 1671 | if (mce_disabled) | 1674 | if (mca_cfg.disabled) |
| 1672 | return; | 1675 | return; |
| 1673 | 1676 | ||
| 1674 | if (__mcheck_cpu_ancient_init(c)) | 1677 | if (__mcheck_cpu_ancient_init(c)) |
| @@ -1678,7 +1681,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) | |||
| 1678 | return; | 1681 | return; |
| 1679 | 1682 | ||
| 1680 | if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { | 1683 | if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { |
| 1681 | mce_disabled = 1; | 1684 | mca_cfg.disabled = true; |
| 1682 | return; | 1685 | return; |
| 1683 | } | 1686 | } |
| 1684 | 1687 | ||
| @@ -1951,6 +1954,8 @@ static struct miscdevice mce_chrdev_device = { | |||
| 1951 | */ | 1954 | */ |
| 1952 | static int __init mcheck_enable(char *str) | 1955 | static int __init mcheck_enable(char *str) |
| 1953 | { | 1956 | { |
| 1957 | struct mca_config *cfg = &mca_cfg; | ||
| 1958 | |||
| 1954 | if (*str == 0) { | 1959 | if (*str == 0) { |
| 1955 | enable_p5_mce(); | 1960 | enable_p5_mce(); |
| 1956 | return 1; | 1961 | return 1; |
| @@ -1958,22 +1963,22 @@ static int __init mcheck_enable(char *str) | |||
| 1958 | if (*str == '=') | 1963 | if (*str == '=') |
| 1959 | str++; | 1964 | str++; |
| 1960 | if (!strcmp(str, "off")) | 1965 | if (!strcmp(str, "off")) |
| 1961 | mce_disabled = 1; | 1966 | cfg->disabled = true; |
| 1962 | else if (!strcmp(str, "no_cmci")) | 1967 | else if (!strcmp(str, "no_cmci")) |
| 1963 | mce_cmci_disabled = 1; | 1968 | cfg->cmci_disabled = true; |
| 1964 | else if (!strcmp(str, "dont_log_ce")) | 1969 | else if (!strcmp(str, "dont_log_ce")) |
| 1965 | mce_dont_log_ce = 1; | 1970 | cfg->dont_log_ce = true; |
| 1966 | else if (!strcmp(str, "ignore_ce")) | 1971 | else if (!strcmp(str, "ignore_ce")) |
| 1967 | mce_ignore_ce = 1; | 1972 | cfg->ignore_ce = true; |
| 1968 | else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) | 1973 | else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) |
| 1969 | mce_bootlog = (str[0] == 'b'); | 1974 | cfg->bootlog = (str[0] == 'b'); |
| 1970 | else if (!strcmp(str, "bios_cmci_threshold")) | 1975 | else if (!strcmp(str, "bios_cmci_threshold")) |
| 1971 | mce_bios_cmci_threshold = 1; | 1976 | cfg->bios_cmci_threshold = true; |
| 1972 | else if (isdigit(str[0])) { | 1977 | else if (isdigit(str[0])) { |
| 1973 | get_option(&str, &tolerant); | 1978 | get_option(&str, &(cfg->tolerant)); |
| 1974 | if (*str == ',') { | 1979 | if (*str == ',') { |
| 1975 | ++str; | 1980 | ++str; |
| 1976 | get_option(&str, &monarch_timeout); | 1981 | get_option(&str, &(cfg->monarch_timeout)); |
| 1977 | } | 1982 | } |
| 1978 | } else { | 1983 | } else { |
| 1979 | pr_info("mce argument %s ignored. Please use /sys\n", str); | 1984 | pr_info("mce argument %s ignored. Please use /sys\n", str); |
| @@ -2002,7 +2007,7 @@ static int mce_disable_error_reporting(void) | |||
| 2002 | { | 2007 | { |
| 2003 | int i; | 2008 | int i; |
| 2004 | 2009 | ||
| 2005 | for (i = 0; i < banks; i++) { | 2010 | for (i = 0; i < mca_cfg.banks; i++) { |
| 2006 | struct mce_bank *b = &mce_banks[i]; | 2011 | struct mce_bank *b = &mce_banks[i]; |
| 2007 | 2012 | ||
| 2008 | if (b->init) | 2013 | if (b->init) |
| @@ -2142,15 +2147,15 @@ static ssize_t set_ignore_ce(struct device *s, | |||
| 2142 | if (strict_strtoull(buf, 0, &new) < 0) | 2147 | if (strict_strtoull(buf, 0, &new) < 0) |
| 2143 | return -EINVAL; | 2148 | return -EINVAL; |
| 2144 | 2149 | ||
| 2145 | if (mce_ignore_ce ^ !!new) { | 2150 | if (mca_cfg.ignore_ce ^ !!new) { |
| 2146 | if (new) { | 2151 | if (new) { |
| 2147 | /* disable ce features */ | 2152 | /* disable ce features */ |
| 2148 | mce_timer_delete_all(); | 2153 | mce_timer_delete_all(); |
| 2149 | on_each_cpu(mce_disable_cmci, NULL, 1); | 2154 | on_each_cpu(mce_disable_cmci, NULL, 1); |
| 2150 | mce_ignore_ce = 1; | 2155 | mca_cfg.ignore_ce = true; |
| 2151 | } else { | 2156 | } else { |
| 2152 | /* enable ce features */ | 2157 | /* enable ce features */ |
| 2153 | mce_ignore_ce = 0; | 2158 | mca_cfg.ignore_ce = false; |
| 2154 | on_each_cpu(mce_enable_ce, (void *)1, 1); | 2159 | on_each_cpu(mce_enable_ce, (void *)1, 1); |
| 2155 | } | 2160 | } |
| 2156 | } | 2161 | } |
| @@ -2166,14 +2171,14 @@ static ssize_t set_cmci_disabled(struct device *s, | |||
| 2166 | if (strict_strtoull(buf, 0, &new) < 0) | 2171 | if (strict_strtoull(buf, 0, &new) < 0) |
| 2167 | return -EINVAL; | 2172 | return -EINVAL; |
| 2168 | 2173 | ||
| 2169 | if (mce_cmci_disabled ^ !!new) { | 2174 | if (mca_cfg.cmci_disabled ^ !!new) { |
| 2170 | if (new) { | 2175 | if (new) { |
| 2171 | /* disable cmci */ | 2176 | /* disable cmci */ |
| 2172 | on_each_cpu(mce_disable_cmci, NULL, 1); | 2177 | on_each_cpu(mce_disable_cmci, NULL, 1); |
| 2173 | mce_cmci_disabled = 1; | 2178 | mca_cfg.cmci_disabled = true; |
| 2174 | } else { | 2179 | } else { |
| 2175 | /* enable cmci */ | 2180 | /* enable cmci */ |
| 2176 | mce_cmci_disabled = 0; | 2181 | mca_cfg.cmci_disabled = false; |
| 2177 | on_each_cpu(mce_enable_ce, NULL, 1); | 2182 | on_each_cpu(mce_enable_ce, NULL, 1); |
| 2178 | } | 2183 | } |
| 2179 | } | 2184 | } |
| @@ -2190,9 +2195,9 @@ static ssize_t store_int_with_restart(struct device *s, | |||
| 2190 | } | 2195 | } |
| 2191 | 2196 | ||
| 2192 | static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); | 2197 | static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); |
| 2193 | static DEVICE_INT_ATTR(tolerant, 0644, tolerant); | 2198 | static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); |
| 2194 | static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); | 2199 | static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); |
| 2195 | static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); | 2200 | static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); |
| 2196 | 2201 | ||
| 2197 | static struct dev_ext_attribute dev_attr_check_interval = { | 2202 | static struct dev_ext_attribute dev_attr_check_interval = { |
| 2198 | __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), | 2203 | __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), |
| @@ -2200,13 +2205,13 @@ static struct dev_ext_attribute dev_attr_check_interval = { | |||
| 2200 | }; | 2205 | }; |
| 2201 | 2206 | ||
| 2202 | static struct dev_ext_attribute dev_attr_ignore_ce = { | 2207 | static struct dev_ext_attribute dev_attr_ignore_ce = { |
| 2203 | __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), | 2208 | __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce), |
| 2204 | &mce_ignore_ce | 2209 | &mca_cfg.ignore_ce |
| 2205 | }; | 2210 | }; |
| 2206 | 2211 | ||
| 2207 | static struct dev_ext_attribute dev_attr_cmci_disabled = { | 2212 | static struct dev_ext_attribute dev_attr_cmci_disabled = { |
| 2208 | __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), | 2213 | __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled), |
| 2209 | &mce_cmci_disabled | 2214 | &mca_cfg.cmci_disabled |
| 2210 | }; | 2215 | }; |
| 2211 | 2216 | ||
| 2212 | static struct device_attribute *mce_device_attrs[] = { | 2217 | static struct device_attribute *mce_device_attrs[] = { |
| @@ -2253,7 +2258,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) | |||
| 2253 | if (err) | 2258 | if (err) |
| 2254 | goto error; | 2259 | goto error; |
| 2255 | } | 2260 | } |
| 2256 | for (j = 0; j < banks; j++) { | 2261 | for (j = 0; j < mca_cfg.banks; j++) { |
| 2257 | err = device_create_file(dev, &mce_banks[j].attr); | 2262 | err = device_create_file(dev, &mce_banks[j].attr); |
| 2258 | if (err) | 2263 | if (err) |
| 2259 | goto error2; | 2264 | goto error2; |
| @@ -2285,7 +2290,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) | |||
| 2285 | for (i = 0; mce_device_attrs[i]; i++) | 2290 | for (i = 0; mce_device_attrs[i]; i++) |
| 2286 | device_remove_file(dev, mce_device_attrs[i]); | 2291 | device_remove_file(dev, mce_device_attrs[i]); |
| 2287 | 2292 | ||
| 2288 | for (i = 0; i < banks; i++) | 2293 | for (i = 0; i < mca_cfg.banks; i++) |
| 2289 | device_remove_file(dev, &mce_banks[i].attr); | 2294 | device_remove_file(dev, &mce_banks[i].attr); |
| 2290 | 2295 | ||
| 2291 | device_unregister(dev); | 2296 | device_unregister(dev); |
| @@ -2304,7 +2309,7 @@ static void __cpuinit mce_disable_cpu(void *h) | |||
| 2304 | 2309 | ||
| 2305 | if (!(action & CPU_TASKS_FROZEN)) | 2310 | if (!(action & CPU_TASKS_FROZEN)) |
| 2306 | cmci_clear(); | 2311 | cmci_clear(); |
| 2307 | for (i = 0; i < banks; i++) { | 2312 | for (i = 0; i < mca_cfg.banks; i++) { |
| 2308 | struct mce_bank *b = &mce_banks[i]; | 2313 | struct mce_bank *b = &mce_banks[i]; |
| 2309 | 2314 | ||
| 2310 | if (b->init) | 2315 | if (b->init) |
| @@ -2322,7 +2327,7 @@ static void __cpuinit mce_reenable_cpu(void *h) | |||
| 2322 | 2327 | ||
| 2323 | if (!(action & CPU_TASKS_FROZEN)) | 2328 | if (!(action & CPU_TASKS_FROZEN)) |
| 2324 | cmci_reenable(); | 2329 | cmci_reenable(); |
| 2325 | for (i = 0; i < banks; i++) { | 2330 | for (i = 0; i < mca_cfg.banks; i++) { |
| 2326 | struct mce_bank *b = &mce_banks[i]; | 2331 | struct mce_bank *b = &mce_banks[i]; |
| 2327 | 2332 | ||
| 2328 | if (b->init) | 2333 | if (b->init) |
| @@ -2375,7 +2380,7 @@ static __init void mce_init_banks(void) | |||
| 2375 | { | 2380 | { |
| 2376 | int i; | 2381 | int i; |
| 2377 | 2382 | ||
| 2378 | for (i = 0; i < banks; i++) { | 2383 | for (i = 0; i < mca_cfg.banks; i++) { |
| 2379 | struct mce_bank *b = &mce_banks[i]; | 2384 | struct mce_bank *b = &mce_banks[i]; |
| 2380 | struct device_attribute *a = &b->attr; | 2385 | struct device_attribute *a = &b->attr; |
| 2381 | 2386 | ||
| @@ -2426,7 +2431,7 @@ device_initcall_sync(mcheck_init_device); | |||
| 2426 | */ | 2431 | */ |
| 2427 | static int __init mcheck_disable(char *str) | 2432 | static int __init mcheck_disable(char *str) |
| 2428 | { | 2433 | { |
| 2429 | mce_disabled = 1; | 2434 | mca_cfg.disabled = true; |
| 2430 | return 1; | 2435 | return 1; |
| 2431 | } | 2436 | } |
| 2432 | __setup("nomce", mcheck_disable); | 2437 | __setup("nomce", mcheck_disable); |
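The mce.c hunks above fold the old file-scope knobs (mce_disabled, mce_bootlog, mce_ignore_ce, mce_cmci_disabled, tolerant, monarch_timeout, ...) into a single struct mca_config instance, mca_cfg, and move the boolean sysfs attributes over to device_show_bool()/DEVICE_BOOL_ATTR(). A rough sketch of the consolidated structure, inferred from the field accesses visible in this diff rather than copied from the patched header (the authoritative definition lives in the arch MCE header), would look something like:

    #include <linux/types.h>

    /* Hypothetical sketch; field widths in the real header may differ. */
    struct mca_config {
            bool dont_log_ce;
            bool cmci_disabled;
            bool ignore_ce;
            bool disabled;
            bool bios_cmci_threshold;
            int  banks;             /* number of MCA banks, formerly the global 'banks' */
            int  bootlog;           /* compared against < 0 above, so signed */
            int  tolerant;
            int  monarch_timeout;
            int  panic_timeout;
    };

With that in place the mce= command-line handler parses e.g. mce=ignore_ce or mce=2,100000 (tolerance level, then monarch timeout in microseconds) straight into cfg fields instead of separate globals.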
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 698b6ec12e0f..1ac581f38dfa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c | |||
| @@ -6,7 +6,7 @@ | |||
| 6 | * | 6 | * |
| 7 | * Written by Jacob Shin - AMD, Inc. | 7 | * Written by Jacob Shin - AMD, Inc. |
| 8 | * | 8 | * |
| 9 | * Support: borislav.petkov@amd.com | 9 | * Maintained by: Borislav Petkov <bp@alien8.de> |
| 10 | * | 10 | * |
| 11 | * April 2006 | 11 | * April 2006 |
| 12 | * - added support for AMD Family 0x10 processors | 12 | * - added support for AMD Family 0x10 processors |
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 5f88abf07e9c..402c454fbff0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
| @@ -53,7 +53,7 @@ static int cmci_supported(int *banks) | |||
| 53 | { | 53 | { |
| 54 | u64 cap; | 54 | u64 cap; |
| 55 | 55 | ||
| 56 | if (mce_cmci_disabled || mce_ignore_ce) | 56 | if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) |
| 57 | return 0; | 57 | return 0; |
| 58 | 58 | ||
| 59 | /* | 59 | /* |
| @@ -200,7 +200,7 @@ static void cmci_discover(int banks) | |||
| 200 | continue; | 200 | continue; |
| 201 | } | 201 | } |
| 202 | 202 | ||
| 203 | if (!mce_bios_cmci_threshold) { | 203 | if (!mca_cfg.bios_cmci_threshold) { |
| 204 | val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; | 204 | val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; |
| 205 | val |= CMCI_THRESHOLD; | 205 | val |= CMCI_THRESHOLD; |
| 206 | } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { | 206 | } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { |
| @@ -227,7 +227,7 @@ static void cmci_discover(int banks) | |||
| 227 | * set the thresholds properly or does not work with | 227 | * set the thresholds properly or does not work with |
| 228 | * this boot option. Note down now and report later. | 228 | * this boot option. Note down now and report later. |
| 229 | */ | 229 | */ |
| 230 | if (mce_bios_cmci_threshold && bios_zero_thresh && | 230 | if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && |
| 231 | (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) | 231 | (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) |
| 232 | bios_wrong_thresh = 1; | 232 | bios_wrong_thresh = 1; |
| 233 | } else { | 233 | } else { |
| @@ -235,7 +235,7 @@ static void cmci_discover(int banks) | |||
| 235 | } | 235 | } |
| 236 | } | 236 | } |
| 237 | raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); | 237 | raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); |
| 238 | if (mce_bios_cmci_threshold && bios_wrong_thresh) { | 238 | if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { |
| 239 | pr_info_once( | 239 | pr_info_once( |
| 240 | "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); | 240 | "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); |
| 241 | pr_info_once( | 241 | pr_info_once( |
| @@ -285,34 +285,39 @@ void cmci_clear(void) | |||
| 285 | raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); | 285 | raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); |
| 286 | } | 286 | } |
| 287 | 287 | ||
| 288 | static long cmci_rediscover_work_func(void *arg) | ||
| 289 | { | ||
| 290 | int banks; | ||
| 291 | |||
| 292 | /* Recheck banks in case CPUs don't all have the same */ | ||
| 293 | if (cmci_supported(&banks)) | ||
| 294 | cmci_discover(banks); | ||
| 295 | |||
| 296 | return 0; | ||
| 297 | } | ||
| 298 | |||
| 288 | /* | 299 | /* |
| 289 | * After a CPU went down cycle through all the others and rediscover | 300 | * After a CPU went down cycle through all the others and rediscover |
| 290 | * Must run in process context. | 301 | * Must run in process context. |
| 291 | */ | 302 | */ |
| 292 | void cmci_rediscover(int dying) | 303 | void cmci_rediscover(int dying) |
| 293 | { | 304 | { |
| 294 | int banks; | 305 | int cpu, banks; |
| 295 | int cpu; | ||
| 296 | cpumask_var_t old; | ||
| 297 | 306 | ||
| 298 | if (!cmci_supported(&banks)) | 307 | if (!cmci_supported(&banks)) |
| 299 | return; | 308 | return; |
| 300 | if (!alloc_cpumask_var(&old, GFP_KERNEL)) | ||
| 301 | return; | ||
| 302 | cpumask_copy(old, ¤t->cpus_allowed); | ||
| 303 | 309 | ||
| 304 | for_each_online_cpu(cpu) { | 310 | for_each_online_cpu(cpu) { |
| 305 | if (cpu == dying) | 311 | if (cpu == dying) |
| 306 | continue; | 312 | continue; |
| 307 | if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) | 313 | |
| 314 | if (cpu == smp_processor_id()) { | ||
| 315 | cmci_rediscover_work_func(NULL); | ||
| 308 | continue; | 316 | continue; |
| 309 | /* Recheck banks in case CPUs don't all have the same */ | 317 | } |
| 310 | if (cmci_supported(&banks)) | ||
| 311 | cmci_discover(banks); | ||
| 312 | } | ||
| 313 | 318 | ||
| 314 | set_cpus_allowed_ptr(current, old); | 319 | work_on_cpu(cpu, cmci_rediscover_work_func, NULL); |
| 315 | free_cpumask_var(old); | 320 | } |
| 316 | } | 321 | } |
| 317 | 322 | ||
| 318 | /* | 323 | /* |
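In the mce_intel.c hunk, cmci_rediscover() no longer migrates the calling task with set_cpus_allowed_ptr(); the per-bank rediscovery is wrapped in cmci_rediscover_work_func() and run on each remaining CPU via work_on_cpu(), which queues the function on that CPU's workqueue and blocks until it returns. A minimal, self-contained sketch of that pattern (not taken from this patch):

    #include <linux/kernel.h>
    #include <linux/smp.h>
    #include <linux/workqueue.h>

    /* Runs with smp_processor_id() equal to the CPU passed to work_on_cpu(). */
    static long do_percpu_setup(void *unused)
    {
            pr_info("running on cpu %d\n", smp_processor_id());
            return 0;
    }

    static void setup_on(int cpu)
    {
            /* Caller must be in process context; work_on_cpu() sleeps. */
            work_on_cpu(cpu, do_percpu_setup, NULL);
    }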
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6b96110bb0c3..726bf963c227 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c | |||
| @@ -606,7 +606,7 @@ void __init mtrr_bp_init(void) | |||
| 606 | 606 | ||
| 607 | /* | 607 | /* |
| 608 | * This is an AMD specific MSR, but we assume(hope?) that | 608 | * This is an AMD specific MSR, but we assume(hope?) that |
| 609 | * Intel will implement it to when they extend the address | 609 | * Intel will implement it too when they extend the address |
| 610 | * bus of the Xeon. | 610 | * bus of the Xeon. |
| 611 | */ | 611 | */ |
| 612 | if (cpuid_eax(0x80000000) >= 0x80000008) { | 612 | if (cpuid_eax(0x80000000) >= 0x80000008) { |
| @@ -695,11 +695,16 @@ void mtrr_ap_init(void) | |||
| 695 | } | 695 | } |
| 696 | 696 | ||
| 697 | /** | 697 | /** |
| 698 | * Save current fixed-range MTRR state of the BSP | 698 | * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. |
| 699 | */ | 699 | */ |
| 700 | void mtrr_save_state(void) | 700 | void mtrr_save_state(void) |
| 701 | { | 701 | { |
| 702 | smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); | 702 | int first_cpu; |
| 703 | |||
| 704 | get_online_cpus(); | ||
| 705 | first_cpu = cpumask_first(cpu_online_mask); | ||
| 706 | smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); | ||
| 707 | put_online_cpus(); | ||
| 703 | } | 708 | } |
| 704 | 709 | ||
| 705 | void set_mtrr_aps_delayed_init(void) | 710 | void set_mtrr_aps_delayed_init(void) |
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4a3374e61a93..4428fd178bce 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
| @@ -1316,6 +1316,121 @@ static struct attribute_group x86_pmu_format_group = { | |||
| 1316 | .attrs = NULL, | 1316 | .attrs = NULL, |
| 1317 | }; | 1317 | }; |
| 1318 | 1318 | ||
| 1319 | struct perf_pmu_events_attr { | ||
| 1320 | struct device_attribute attr; | ||
| 1321 | u64 id; | ||
| 1322 | }; | ||
| 1323 | |||
| 1324 | /* | ||
| 1325 | * Remove all undefined events (x86_pmu.event_map(id) == 0) | ||
| 1326 | * out of events_attr attributes. | ||
| 1327 | */ | ||
| 1328 | static void __init filter_events(struct attribute **attrs) | ||
| 1329 | { | ||
| 1330 | int i, j; | ||
| 1331 | |||
| 1332 | for (i = 0; attrs[i]; i++) { | ||
| 1333 | if (x86_pmu.event_map(i)) | ||
| 1334 | continue; | ||
| 1335 | |||
| 1336 | for (j = i; attrs[j]; j++) | ||
| 1337 | attrs[j] = attrs[j + 1]; | ||
| 1338 | |||
| 1339 | /* Check the shifted attr. */ | ||
| 1340 | i--; | ||
| 1341 | } | ||
| 1342 | } | ||
| 1343 | |||
| 1344 | static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, | ||
| 1345 | char *page) | ||
| 1346 | { | ||
| 1347 | struct perf_pmu_events_attr *pmu_attr = \ | ||
| 1348 | container_of(attr, struct perf_pmu_events_attr, attr); | ||
| 1349 | |||
| 1350 | u64 config = x86_pmu.event_map(pmu_attr->id); | ||
| 1351 | return x86_pmu.events_sysfs_show(page, config); | ||
| 1352 | } | ||
| 1353 | |||
| 1354 | #define EVENT_VAR(_id) event_attr_##_id | ||
| 1355 | #define EVENT_PTR(_id) &event_attr_##_id.attr.attr | ||
| 1356 | |||
| 1357 | #define EVENT_ATTR(_name, _id) \ | ||
| 1358 | static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ | ||
| 1359 | .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ | ||
| 1360 | .id = PERF_COUNT_HW_##_id, \ | ||
| 1361 | }; | ||
| 1362 | |||
| 1363 | EVENT_ATTR(cpu-cycles, CPU_CYCLES ); | ||
| 1364 | EVENT_ATTR(instructions, INSTRUCTIONS ); | ||
| 1365 | EVENT_ATTR(cache-references, CACHE_REFERENCES ); | ||
| 1366 | EVENT_ATTR(cache-misses, CACHE_MISSES ); | ||
| 1367 | EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); | ||
| 1368 | EVENT_ATTR(branch-misses, BRANCH_MISSES ); | ||
| 1369 | EVENT_ATTR(bus-cycles, BUS_CYCLES ); | ||
| 1370 | EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); | ||
| 1371 | EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); | ||
| 1372 | EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); | ||
| 1373 | |||
| 1374 | static struct attribute *empty_attrs; | ||
| 1375 | |||
| 1376 | static struct attribute *events_attr[] = { | ||
| 1377 | EVENT_PTR(CPU_CYCLES), | ||
| 1378 | EVENT_PTR(INSTRUCTIONS), | ||
| 1379 | EVENT_PTR(CACHE_REFERENCES), | ||
| 1380 | EVENT_PTR(CACHE_MISSES), | ||
| 1381 | EVENT_PTR(BRANCH_INSTRUCTIONS), | ||
| 1382 | EVENT_PTR(BRANCH_MISSES), | ||
| 1383 | EVENT_PTR(BUS_CYCLES), | ||
| 1384 | EVENT_PTR(STALLED_CYCLES_FRONTEND), | ||
| 1385 | EVENT_PTR(STALLED_CYCLES_BACKEND), | ||
| 1386 | EVENT_PTR(REF_CPU_CYCLES), | ||
| 1387 | NULL, | ||
| 1388 | }; | ||
| 1389 | |||
| 1390 | static struct attribute_group x86_pmu_events_group = { | ||
| 1391 | .name = "events", | ||
| 1392 | .attrs = events_attr, | ||
| 1393 | }; | ||
| 1394 | |||
| 1395 | ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) | ||
| 1396 | { | ||
| 1397 | u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; | ||
| 1398 | u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; | ||
| 1399 | bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); | ||
| 1400 | bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); | ||
| 1401 | bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); | ||
| 1402 | bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); | ||
| 1403 | ssize_t ret; | ||
| 1404 | |||
| 1405 | /* | ||
| 1406 | * We have whole page size to spend and just little data | ||
| 1407 | * to write, so we can safely use sprintf. | ||
| 1408 | */ | ||
| 1409 | ret = sprintf(page, "event=0x%02llx", event); | ||
| 1410 | |||
| 1411 | if (umask) | ||
| 1412 | ret += sprintf(page + ret, ",umask=0x%02llx", umask); | ||
| 1413 | |||
| 1414 | if (edge) | ||
| 1415 | ret += sprintf(page + ret, ",edge"); | ||
| 1416 | |||
| 1417 | if (pc) | ||
| 1418 | ret += sprintf(page + ret, ",pc"); | ||
| 1419 | |||
| 1420 | if (any) | ||
| 1421 | ret += sprintf(page + ret, ",any"); | ||
| 1422 | |||
| 1423 | if (inv) | ||
| 1424 | ret += sprintf(page + ret, ",inv"); | ||
| 1425 | |||
| 1426 | if (cmask) | ||
| 1427 | ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); | ||
| 1428 | |||
| 1429 | ret += sprintf(page + ret, "\n"); | ||
| 1430 | |||
| 1431 | return ret; | ||
| 1432 | } | ||
| 1433 | |||
| 1319 | static int __init init_hw_perf_events(void) | 1434 | static int __init init_hw_perf_events(void) |
| 1320 | { | 1435 | { |
| 1321 | struct x86_pmu_quirk *quirk; | 1436 | struct x86_pmu_quirk *quirk; |
| @@ -1362,6 +1477,11 @@ static int __init init_hw_perf_events(void) | |||
| 1362 | x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ | 1477 | x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ |
| 1363 | x86_pmu_format_group.attrs = x86_pmu.format_attrs; | 1478 | x86_pmu_format_group.attrs = x86_pmu.format_attrs; |
| 1364 | 1479 | ||
| 1480 | if (!x86_pmu.events_sysfs_show) | ||
| 1481 | x86_pmu_events_group.attrs = &empty_attrs; | ||
| 1482 | else | ||
| 1483 | filter_events(x86_pmu_events_group.attrs); | ||
| 1484 | |||
| 1365 | pr_info("... version: %d\n", x86_pmu.version); | 1485 | pr_info("... version: %d\n", x86_pmu.version); |
| 1366 | pr_info("... bit width: %d\n", x86_pmu.cntval_bits); | 1486 | pr_info("... bit width: %d\n", x86_pmu.cntval_bits); |
| 1367 | pr_info("... generic registers: %d\n", x86_pmu.num_counters); | 1487 | pr_info("... generic registers: %d\n", x86_pmu.num_counters); |
| @@ -1651,6 +1771,7 @@ static struct attribute_group x86_pmu_attr_group = { | |||
| 1651 | static const struct attribute_group *x86_pmu_attr_groups[] = { | 1771 | static const struct attribute_group *x86_pmu_attr_groups[] = { |
| 1652 | &x86_pmu_attr_group, | 1772 | &x86_pmu_attr_group, |
| 1653 | &x86_pmu_format_group, | 1773 | &x86_pmu_format_group, |
| 1774 | &x86_pmu_events_group, | ||
| 1654 | NULL, | 1775 | NULL, |
| 1655 | }; | 1776 | }; |
| 1656 | 1777 | ||
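The perf_event.c additions export the generic hardware events through sysfs: each defined event gets an attribute in a new "events" group on the cpu PMU, filter_events() drops the ones the CPU does not implement (event_map(i) == 0), and x86_event_sysfs_show() renders the raw config in the familiar "event=0x..,umask=0x..[,edge][,pc][,any][,inv][,cmask=0x..]" form. On a patched kernel the files are expected to appear under /sys/devices/cpu/events/ (equivalently /sys/bus/event_source/devices/cpu/events/); the path and the sample value below are illustrative assumptions, not part of this diff:

    #include <stdio.h>

    /* Print the encoding string sysfs exposes for one generic event,
     * e.g. "event=0x3c" for cpu-cycles on an Intel core PMU. */
    int main(void)
    {
            char buf[128];
            FILE *f = fopen("/sys/devices/cpu/events/cpu-cycles", "r");

            if (!f) {
                    perror("open cpu-cycles");
                    return 1;
            }
            if (fgets(buf, sizeof(buf), f))
                    printf("cpu-cycles -> %s", buf);  /* string already ends in '\n' */
            fclose(f);
            return 0;
    }

The same strings are what perf's event parser can later consume for symbolic PMU events, though that side is outside this patch.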
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 271d25700297..115c1ea97746 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h | |||
| @@ -354,6 +354,8 @@ struct x86_pmu { | |||
| 354 | int attr_rdpmc; | 354 | int attr_rdpmc; |
| 355 | struct attribute **format_attrs; | 355 | struct attribute **format_attrs; |
| 356 | 356 | ||
| 357 | ssize_t (*events_sysfs_show)(char *page, u64 config); | ||
| 358 | |||
| 357 | /* | 359 | /* |
| 358 | * CPU Hotplug hooks | 360 | * CPU Hotplug hooks |
| 359 | */ | 361 | */ |
| @@ -536,6 +538,9 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) | |||
| 536 | regs->ip = ip; | 538 | regs->ip = ip; |
| 537 | } | 539 | } |
| 538 | 540 | ||
| 541 | ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); | ||
| 542 | ssize_t intel_event_sysfs_show(char *page, u64 config); | ||
| 543 | |||
| 539 | #ifdef CONFIG_CPU_SUP_AMD | 544 | #ifdef CONFIG_CPU_SUP_AMD |
| 540 | 545 | ||
| 541 | int amd_pmu_init(void); | 546 | int amd_pmu_init(void); |
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 4528ae7b6ec4..c93bc4e813a0 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c | |||
| @@ -568,6 +568,14 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev | |||
| 568 | } | 568 | } |
| 569 | } | 569 | } |
| 570 | 570 | ||
| 571 | static ssize_t amd_event_sysfs_show(char *page, u64 config) | ||
| 572 | { | ||
| 573 | u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | | ||
| 574 | (config & AMD64_EVENTSEL_EVENT) >> 24; | ||
| 575 | |||
| 576 | return x86_event_sysfs_show(page, config, event); | ||
| 577 | } | ||
| 578 | |||
| 571 | static __initconst const struct x86_pmu amd_pmu = { | 579 | static __initconst const struct x86_pmu amd_pmu = { |
| 572 | .name = "AMD", | 580 | .name = "AMD", |
| 573 | .handle_irq = x86_pmu_handle_irq, | 581 | .handle_irq = x86_pmu_handle_irq, |
| @@ -591,6 +599,7 @@ static __initconst const struct x86_pmu amd_pmu = { | |||
| 591 | .put_event_constraints = amd_put_event_constraints, | 599 | .put_event_constraints = amd_put_event_constraints, |
| 592 | 600 | ||
| 593 | .format_attrs = amd_format_attr, | 601 | .format_attrs = amd_format_attr, |
| 602 | .events_sysfs_show = amd_event_sysfs_show, | ||
| 594 | 603 | ||
| 595 | .cpu_prepare = amd_pmu_cpu_prepare, | 604 | .cpu_prepare = amd_pmu_cpu_prepare, |
| 596 | .cpu_starting = amd_pmu_cpu_starting, | 605 | .cpu_starting = amd_pmu_cpu_starting, |
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 324bb523d9d9..93b9e1181f83 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c | |||
| @@ -1603,6 +1603,13 @@ static struct attribute *intel_arch_formats_attr[] = { | |||
| 1603 | NULL, | 1603 | NULL, |
| 1604 | }; | 1604 | }; |
| 1605 | 1605 | ||
| 1606 | ssize_t intel_event_sysfs_show(char *page, u64 config) | ||
| 1607 | { | ||
| 1608 | u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); | ||
| 1609 | |||
| 1610 | return x86_event_sysfs_show(page, config, event); | ||
| 1611 | } | ||
| 1612 | |||
| 1606 | static __initconst const struct x86_pmu core_pmu = { | 1613 | static __initconst const struct x86_pmu core_pmu = { |
| 1607 | .name = "core", | 1614 | .name = "core", |
| 1608 | .handle_irq = x86_pmu_handle_irq, | 1615 | .handle_irq = x86_pmu_handle_irq, |
| @@ -1628,6 +1635,7 @@ static __initconst const struct x86_pmu core_pmu = { | |||
| 1628 | .event_constraints = intel_core_event_constraints, | 1635 | .event_constraints = intel_core_event_constraints, |
| 1629 | .guest_get_msrs = core_guest_get_msrs, | 1636 | .guest_get_msrs = core_guest_get_msrs, |
| 1630 | .format_attrs = intel_arch_formats_attr, | 1637 | .format_attrs = intel_arch_formats_attr, |
| 1638 | .events_sysfs_show = intel_event_sysfs_show, | ||
| 1631 | }; | 1639 | }; |
| 1632 | 1640 | ||
| 1633 | struct intel_shared_regs *allocate_shared_regs(int cpu) | 1641 | struct intel_shared_regs *allocate_shared_regs(int cpu) |
| @@ -1766,6 +1774,7 @@ static __initconst const struct x86_pmu intel_pmu = { | |||
| 1766 | .pebs_aliases = intel_pebs_aliases_core2, | 1774 | .pebs_aliases = intel_pebs_aliases_core2, |
| 1767 | 1775 | ||
| 1768 | .format_attrs = intel_arch3_formats_attr, | 1776 | .format_attrs = intel_arch3_formats_attr, |
| 1777 | .events_sysfs_show = intel_event_sysfs_show, | ||
| 1769 | 1778 | ||
| 1770 | .cpu_prepare = intel_pmu_cpu_prepare, | 1779 | .cpu_prepare = intel_pmu_cpu_prepare, |
| 1771 | .cpu_starting = intel_pmu_cpu_starting, | 1780 | .cpu_starting = intel_pmu_cpu_starting, |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 3cf3d97cce3a..b43200dbfe7e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c | |||
| @@ -2500,7 +2500,7 @@ static bool pcidrv_registered; | |||
| 2500 | /* | 2500 | /* |
| 2501 | * add a pci uncore device | 2501 | * add a pci uncore device |
| 2502 | */ | 2502 | */ |
| 2503 | static int __devinit uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) | 2503 | static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) |
| 2504 | { | 2504 | { |
| 2505 | struct intel_uncore_pmu *pmu; | 2505 | struct intel_uncore_pmu *pmu; |
| 2506 | struct intel_uncore_box *box; | 2506 | struct intel_uncore_box *box; |
| @@ -2571,8 +2571,8 @@ static void uncore_pci_remove(struct pci_dev *pdev) | |||
| 2571 | kfree(box); | 2571 | kfree(box); |
| 2572 | } | 2572 | } |
| 2573 | 2573 | ||
| 2574 | static int __devinit uncore_pci_probe(struct pci_dev *pdev, | 2574 | static int uncore_pci_probe(struct pci_dev *pdev, |
| 2575 | const struct pci_device_id *id) | 2575 | const struct pci_device_id *id) |
| 2576 | { | 2576 | { |
| 2577 | struct intel_uncore_type *type; | 2577 | struct intel_uncore_type *type; |
| 2578 | 2578 | ||
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 7d0270bd793e..f2af39f5dc3d 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c | |||
| @@ -227,6 +227,8 @@ static __initconst const struct x86_pmu p6_pmu = { | |||
| 227 | .event_constraints = p6_event_constraints, | 227 | .event_constraints = p6_event_constraints, |
| 228 | 228 | ||
| 229 | .format_attrs = intel_p6_formats_attr, | 229 | .format_attrs = intel_p6_formats_attr, |
| 230 | .events_sysfs_show = intel_event_sysfs_show, | ||
| 231 | |||
| 230 | }; | 232 | }; |
| 231 | 233 | ||
| 232 | __init int p6_pmu_init(void) | 234 | __init int p6_pmu_init(void) |
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index fbd895562292..3286a92e662a 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c | |||
| @@ -26,11 +26,6 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, | |||
| 26 | #ifdef CONFIG_X86_32 | 26 | #ifdef CONFIG_X86_32 |
| 27 | static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) | 27 | static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) |
| 28 | { | 28 | { |
| 29 | /* | ||
| 30 | * We use exception 16 if we have hardware math and we've either seen | ||
| 31 | * it or the CPU claims it is internal | ||
| 32 | */ | ||
| 33 | int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu); | ||
| 34 | seq_printf(m, | 29 | seq_printf(m, |
| 35 | "fdiv_bug\t: %s\n" | 30 | "fdiv_bug\t: %s\n" |
| 36 | "hlt_bug\t\t: %s\n" | 31 | "hlt_bug\t\t: %s\n" |
| @@ -45,7 +40,7 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) | |||
| 45 | c->f00f_bug ? "yes" : "no", | 40 | c->f00f_bug ? "yes" : "no", |
| 46 | c->coma_bug ? "yes" : "no", | 41 | c->coma_bug ? "yes" : "no", |
| 47 | c->hard_math ? "yes" : "no", | 42 | c->hard_math ? "yes" : "no", |
| 48 | fpu_exception ? "yes" : "no", | 43 | c->hard_math ? "yes" : "no", |
| 49 | c->cpuid_level, | 44 | c->cpuid_level, |
| 50 | c->wp_works_ok ? "yes" : "no"); | 45 | c->wp_works_ok ? "yes" : "no"); |
| 51 | } | 46 | } |
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 13ad89971d47..74467feb4dc5 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/delay.h> | 16 | #include <linux/delay.h> |
| 17 | #include <linux/elf.h> | 17 | #include <linux/elf.h> |
| 18 | #include <linux/elfcore.h> | 18 | #include <linux/elfcore.h> |
| 19 | #include <linux/module.h> | ||
| 19 | 20 | ||
| 20 | #include <asm/processor.h> | 21 | #include <asm/processor.h> |
| 21 | #include <asm/hardirq.h> | 22 | #include <asm/hardirq.h> |
| @@ -30,6 +31,27 @@ | |||
| 30 | 31 | ||
| 31 | int in_crash_kexec; | 32 | int in_crash_kexec; |
| 32 | 33 | ||
| 34 | /* | ||
| 35 | * This is used to VMCLEAR all VMCSs loaded on the | ||
| 36 | * processor. And when loading kvm_intel module, the | ||
| 37 | * callback function pointer will be assigned. | ||
| 38 | * | ||
| 39 | * protected by rcu. | ||
| 40 | */ | ||
| 41 | crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; | ||
| 42 | EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); | ||
| 43 | |||
| 44 | static inline void cpu_crash_vmclear_loaded_vmcss(void) | ||
| 45 | { | ||
| 46 | crash_vmclear_fn *do_vmclear_operation = NULL; | ||
| 47 | |||
| 48 | rcu_read_lock(); | ||
| 49 | do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); | ||
| 50 | if (do_vmclear_operation) | ||
| 51 | do_vmclear_operation(); | ||
| 52 | rcu_read_unlock(); | ||
| 53 | } | ||
| 54 | |||
| 33 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) | 55 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) |
| 34 | 56 | ||
| 35 | static void kdump_nmi_callback(int cpu, struct pt_regs *regs) | 57 | static void kdump_nmi_callback(int cpu, struct pt_regs *regs) |
| @@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) | |||
| 46 | #endif | 68 | #endif |
| 47 | crash_save_cpu(regs, cpu); | 69 | crash_save_cpu(regs, cpu); |
| 48 | 70 | ||
| 71 | /* | ||
| 72 | * VMCLEAR VMCSs loaded on all cpus if needed. | ||
| 73 | */ | ||
| 74 | cpu_crash_vmclear_loaded_vmcss(); | ||
| 75 | |||
| 49 | /* Disable VMX or SVM if needed. | 76 | /* Disable VMX or SVM if needed. |
| 50 | * | 77 | * |
| 51 | * We need to disable virtualization on all CPUs. | 78 | * We need to disable virtualization on all CPUs. |
| @@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs) | |||
| 88 | 115 | ||
| 89 | kdump_nmi_shootdown_cpus(); | 116 | kdump_nmi_shootdown_cpus(); |
| 90 | 117 | ||
| 118 | /* | ||
| 119 | * VMCLEAR VMCSs loaded on this cpu if needed. | ||
| 120 | */ | ||
| 121 | cpu_crash_vmclear_loaded_vmcss(); | ||
| 122 | |||
| 91 | /* Booting kdump kernel with VMX or SVM enabled won't work, | 123 | /* Booting kdump kernel with VMX or SVM enabled won't work, |
| 92 | * because (among other limitations) we can't disable paging | 124 | * because (among other limitations) we can't disable paging |
| 93 | * with the virt flags. | 125 | * with the virt flags. |
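crash.c gains an RCU-protected function pointer, crash_vmclear_loaded_vmcss, which the kdump path calls on each CPU before disabling VMX so that any VMCSs still loaded by kvm_intel are VMCLEARed and their state reaches memory before the crash kernel boots. The consumer side is not part of this hunk; a hedged sketch of how a VMX module would be expected to publish and retract the callback:

    #include <linux/rcupdate.h>

    /* The typedef and extern normally come from the arch header touched by
     * this series; repeated here only to keep the sketch self-contained. */
    typedef void crash_vmclear_fn(void);
    extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;

    static void vmclear_local_loaded_vmcss(void)
    {
            /* VMCLEAR every VMCS this CPU still has loaded (module-specific). */
    }

    static void publish_crash_callback(void)           /* e.g. on module load */
    {
            rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                               vmclear_local_loaded_vmcss);
    }

    static void retract_crash_callback(void)           /* e.g. on module unload */
    {
            RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
            synchronize_rcu();  /* let any in-flight crash-path reader finish */
    }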
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 88b725aa1d52..ff84d5469d77 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
| @@ -739,30 +739,11 @@ ENTRY(ptregs_##name) ; \ | |||
| 739 | ENDPROC(ptregs_##name) | 739 | ENDPROC(ptregs_##name) |
| 740 | 740 | ||
| 741 | PTREGSCALL1(iopl) | 741 | PTREGSCALL1(iopl) |
| 742 | PTREGSCALL0(fork) | ||
| 743 | PTREGSCALL0(vfork) | ||
| 744 | PTREGSCALL2(sigaltstack) | ||
| 745 | PTREGSCALL0(sigreturn) | 742 | PTREGSCALL0(sigreturn) |
| 746 | PTREGSCALL0(rt_sigreturn) | 743 | PTREGSCALL0(rt_sigreturn) |
| 747 | PTREGSCALL2(vm86) | 744 | PTREGSCALL2(vm86) |
| 748 | PTREGSCALL1(vm86old) | 745 | PTREGSCALL1(vm86old) |
| 749 | 746 | ||
| 750 | /* Clone is an oddball. The 4th arg is in %edi */ | ||
| 751 | ENTRY(ptregs_clone) | ||
| 752 | CFI_STARTPROC | ||
| 753 | leal 4(%esp),%eax | ||
| 754 | pushl_cfi %eax | ||
| 755 | pushl_cfi PT_EDI(%eax) | ||
| 756 | movl PT_EDX(%eax),%ecx | ||
| 757 | movl PT_ECX(%eax),%edx | ||
| 758 | movl PT_EBX(%eax),%eax | ||
| 759 | call sys_clone | ||
| 760 | addl $8,%esp | ||
| 761 | CFI_ADJUST_CFA_OFFSET -8 | ||
| 762 | ret | ||
| 763 | CFI_ENDPROC | ||
| 764 | ENDPROC(ptregs_clone) | ||
| 765 | |||
| 766 | .macro FIXUP_ESPFIX_STACK | 747 | .macro FIXUP_ESPFIX_STACK |
| 767 | /* | 748 | /* |
| 768 | * Switch back for ESPFIX stack to the normal zerobased stack | 749 | * Switch back for ESPFIX stack to the normal zerobased stack |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b51b2c7ee51f..07a7a04529bc 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
| @@ -56,7 +56,7 @@ | |||
| 56 | #include <asm/ftrace.h> | 56 | #include <asm/ftrace.h> |
| 57 | #include <asm/percpu.h> | 57 | #include <asm/percpu.h> |
| 58 | #include <asm/asm.h> | 58 | #include <asm/asm.h> |
| 59 | #include <asm/rcu.h> | 59 | #include <asm/context_tracking.h> |
| 60 | #include <asm/smap.h> | 60 | #include <asm/smap.h> |
| 61 | #include <linux/err.h> | 61 | #include <linux/err.h> |
| 62 | 62 | ||
| @@ -845,10 +845,25 @@ ENTRY(\label) | |||
| 845 | END(\label) | 845 | END(\label) |
| 846 | .endm | 846 | .endm |
| 847 | 847 | ||
| 848 | PTREGSCALL stub_clone, sys_clone, %r8 | 848 | .macro FORK_LIKE func |
| 849 | PTREGSCALL stub_fork, sys_fork, %rdi | 849 | ENTRY(stub_\func) |
| 850 | PTREGSCALL stub_vfork, sys_vfork, %rdi | 850 | CFI_STARTPROC |
| 851 | PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx | 851 | popq %r11 /* save return address */ |
| 852 | PARTIAL_FRAME 0 | ||
| 853 | SAVE_REST | ||
| 854 | pushq %r11 /* put it back on stack */ | ||
| 855 | FIXUP_TOP_OF_STACK %r11, 8 | ||
| 856 | DEFAULT_FRAME 0 8 /* offset 8: return address */ | ||
| 857 | call sys_\func | ||
| 858 | RESTORE_TOP_OF_STACK %r11, 8 | ||
| 859 | ret $REST_SKIP /* pop extended registers */ | ||
| 860 | CFI_ENDPROC | ||
| 861 | END(stub_\func) | ||
| 862 | .endm | ||
| 863 | |||
| 864 | FORK_LIKE clone | ||
| 865 | FORK_LIKE fork | ||
| 866 | FORK_LIKE vfork | ||
| 852 | PTREGSCALL stub_iopl, sys_iopl, %rsi | 867 | PTREGSCALL stub_iopl, sys_iopl, %rsi |
| 853 | 868 | ||
| 854 | ENTRY(ptregscall_common) | 869 | ENTRY(ptregscall_common) |
| @@ -897,8 +912,6 @@ ENTRY(stub_rt_sigreturn) | |||
| 897 | END(stub_rt_sigreturn) | 912 | END(stub_rt_sigreturn) |
| 898 | 913 | ||
| 899 | #ifdef CONFIG_X86_X32_ABI | 914 | #ifdef CONFIG_X86_X32_ABI |
| 900 | PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx | ||
| 901 | |||
| 902 | ENTRY(stub_x32_rt_sigreturn) | 915 | ENTRY(stub_x32_rt_sigreturn) |
| 903 | CFI_STARTPROC | 916 | CFI_STARTPROC |
| 904 | addq $8, %rsp | 917 | addq $8, %rsp |
| @@ -995,8 +1008,8 @@ END(interrupt) | |||
| 995 | */ | 1008 | */ |
| 996 | .p2align CONFIG_X86_L1_CACHE_SHIFT | 1009 | .p2align CONFIG_X86_L1_CACHE_SHIFT |
| 997 | common_interrupt: | 1010 | common_interrupt: |
| 998 | ASM_CLAC | ||
| 999 | XCPT_FRAME | 1011 | XCPT_FRAME |
| 1012 | ASM_CLAC | ||
| 1000 | addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ | 1013 | addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ |
| 1001 | interrupt do_IRQ | 1014 | interrupt do_IRQ |
| 1002 | /* 0(%rsp): old_rsp-ARGOFFSET */ | 1015 | /* 0(%rsp): old_rsp-ARGOFFSET */ |
| @@ -1135,8 +1148,8 @@ END(common_interrupt) | |||
| 1135 | */ | 1148 | */ |
| 1136 | .macro apicinterrupt num sym do_sym | 1149 | .macro apicinterrupt num sym do_sym |
| 1137 | ENTRY(\sym) | 1150 | ENTRY(\sym) |
| 1138 | ASM_CLAC | ||
| 1139 | INTR_FRAME | 1151 | INTR_FRAME |
| 1152 | ASM_CLAC | ||
| 1140 | pushq_cfi $~(\num) | 1153 | pushq_cfi $~(\num) |
| 1141 | .Lcommon_\sym: | 1154 | .Lcommon_\sym: |
| 1142 | interrupt \do_sym | 1155 | interrupt \do_sym |
| @@ -1190,8 +1203,8 @@ apicinterrupt IRQ_WORK_VECTOR \ | |||
| 1190 | */ | 1203 | */ |
| 1191 | .macro zeroentry sym do_sym | 1204 | .macro zeroentry sym do_sym |
| 1192 | ENTRY(\sym) | 1205 | ENTRY(\sym) |
| 1193 | ASM_CLAC | ||
| 1194 | INTR_FRAME | 1206 | INTR_FRAME |
| 1207 | ASM_CLAC | ||
| 1195 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1208 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
| 1196 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | 1209 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
| 1197 | subq $ORIG_RAX-R15, %rsp | 1210 | subq $ORIG_RAX-R15, %rsp |
| @@ -1208,8 +1221,8 @@ END(\sym) | |||
| 1208 | 1221 | ||
| 1209 | .macro paranoidzeroentry sym do_sym | 1222 | .macro paranoidzeroentry sym do_sym |
| 1210 | ENTRY(\sym) | 1223 | ENTRY(\sym) |
| 1211 | ASM_CLAC | ||
| 1212 | INTR_FRAME | 1224 | INTR_FRAME |
| 1225 | ASM_CLAC | ||
| 1213 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1226 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
| 1214 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | 1227 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
| 1215 | subq $ORIG_RAX-R15, %rsp | 1228 | subq $ORIG_RAX-R15, %rsp |
| @@ -1227,8 +1240,8 @@ END(\sym) | |||
| 1227 | #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) | 1240 | #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) |
| 1228 | .macro paranoidzeroentry_ist sym do_sym ist | 1241 | .macro paranoidzeroentry_ist sym do_sym ist |
| 1229 | ENTRY(\sym) | 1242 | ENTRY(\sym) |
| 1230 | ASM_CLAC | ||
| 1231 | INTR_FRAME | 1243 | INTR_FRAME |
| 1244 | ASM_CLAC | ||
| 1232 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1245 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
| 1233 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | 1246 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
| 1234 | subq $ORIG_RAX-R15, %rsp | 1247 | subq $ORIG_RAX-R15, %rsp |
| @@ -1247,8 +1260,8 @@ END(\sym) | |||
| 1247 | 1260 | ||
| 1248 | .macro errorentry sym do_sym | 1261 | .macro errorentry sym do_sym |
| 1249 | ENTRY(\sym) | 1262 | ENTRY(\sym) |
| 1250 | ASM_CLAC | ||
| 1251 | XCPT_FRAME | 1263 | XCPT_FRAME |
| 1264 | ASM_CLAC | ||
| 1252 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1265 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
| 1253 | subq $ORIG_RAX-R15, %rsp | 1266 | subq $ORIG_RAX-R15, %rsp |
| 1254 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | 1267 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
| @@ -1266,8 +1279,8 @@ END(\sym) | |||
| 1266 | /* error code is on the stack already */ | 1279 | /* error code is on the stack already */ |
| 1267 | .macro paranoiderrorentry sym do_sym | 1280 | .macro paranoiderrorentry sym do_sym |
| 1268 | ENTRY(\sym) | 1281 | ENTRY(\sym) |
| 1269 | ASM_CLAC | ||
| 1270 | XCPT_FRAME | 1282 | XCPT_FRAME |
| 1283 | ASM_CLAC | ||
| 1271 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1284 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
| 1272 | subq $ORIG_RAX-R15, %rsp | 1285 | subq $ORIG_RAX-R15, %rsp |
| 1273 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | 1286 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
| @@ -1699,9 +1712,10 @@ nested_nmi: | |||
| 1699 | 1712 | ||
| 1700 | 1: | 1713 | 1: |
| 1701 | /* Set up the interrupted NMIs stack to jump to repeat_nmi */ | 1714 | /* Set up the interrupted NMIs stack to jump to repeat_nmi */ |
| 1702 | leaq -6*8(%rsp), %rdx | 1715 | leaq -1*8(%rsp), %rdx |
| 1703 | movq %rdx, %rsp | 1716 | movq %rdx, %rsp |
| 1704 | CFI_ADJUST_CFA_OFFSET 6*8 | 1717 | CFI_ADJUST_CFA_OFFSET 1*8 |
| 1718 | leaq -10*8(%rsp), %rdx | ||
| 1705 | pushq_cfi $__KERNEL_DS | 1719 | pushq_cfi $__KERNEL_DS |
| 1706 | pushq_cfi %rdx | 1720 | pushq_cfi %rdx |
| 1707 | pushfq_cfi | 1721 | pushfq_cfi |
| @@ -1709,8 +1723,8 @@ nested_nmi: | |||
| 1709 | pushq_cfi $repeat_nmi | 1723 | pushq_cfi $repeat_nmi |
| 1710 | 1724 | ||
| 1711 | /* Put stack back */ | 1725 | /* Put stack back */ |
| 1712 | addq $(11*8), %rsp | 1726 | addq $(6*8), %rsp |
| 1713 | CFI_ADJUST_CFA_OFFSET -11*8 | 1727 | CFI_ADJUST_CFA_OFFSET -6*8 |
| 1714 | 1728 | ||
| 1715 | nested_nmi_out: | 1729 | nested_nmi_out: |
| 1716 | popq_cfi %rdx | 1730 | popq_cfi %rdx |
| @@ -1736,18 +1750,18 @@ first_nmi: | |||
| 1736 | * +-------------------------+ | 1750 | * +-------------------------+ |
| 1737 | * | NMI executing variable | | 1751 | * | NMI executing variable | |
| 1738 | * +-------------------------+ | 1752 | * +-------------------------+ |
| 1739 | * | Saved SS | | ||
| 1740 | * | Saved Return RSP | | ||
| 1741 | * | Saved RFLAGS | | ||
| 1742 | * | Saved CS | | ||
| 1743 | * | Saved RIP | | ||
| 1744 | * +-------------------------+ | ||
| 1745 | * | copied SS | | 1753 | * | copied SS | |
| 1746 | * | copied Return RSP | | 1754 | * | copied Return RSP | |
| 1747 | * | copied RFLAGS | | 1755 | * | copied RFLAGS | |
| 1748 | * | copied CS | | 1756 | * | copied CS | |
| 1749 | * | copied RIP | | 1757 | * | copied RIP | |
| 1750 | * +-------------------------+ | 1758 | * +-------------------------+ |
| 1759 | * | Saved SS | | ||
| 1760 | * | Saved Return RSP | | ||
| 1761 | * | Saved RFLAGS | | ||
| 1762 | * | Saved CS | | ||
| 1763 | * | Saved RIP | | ||
| 1764 | * +-------------------------+ | ||
| 1751 | * | pt_regs | | 1765 | * | pt_regs | |
| 1752 | * +-------------------------+ | 1766 | * +-------------------------+ |
| 1753 | * | 1767 | * |
| @@ -1763,9 +1777,14 @@ first_nmi: | |||
| 1763 | /* Set the NMI executing variable on the stack. */ | 1777 | /* Set the NMI executing variable on the stack. */ |
| 1764 | pushq_cfi $1 | 1778 | pushq_cfi $1 |
| 1765 | 1779 | ||
| 1780 | /* | ||
| 1781 | * Leave room for the "copied" frame | ||
| 1782 | */ | ||
| 1783 | subq $(5*8), %rsp | ||
| 1784 | |||
| 1766 | /* Copy the stack frame to the Saved frame */ | 1785 | /* Copy the stack frame to the Saved frame */ |
| 1767 | .rept 5 | 1786 | .rept 5 |
| 1768 | pushq_cfi 6*8(%rsp) | 1787 | pushq_cfi 11*8(%rsp) |
| 1769 | .endr | 1788 | .endr |
| 1770 | CFI_DEF_CFA_OFFSET SS+8-RIP | 1789 | CFI_DEF_CFA_OFFSET SS+8-RIP |
| 1771 | 1790 | ||
| @@ -1786,12 +1805,15 @@ repeat_nmi: | |||
| 1786 | * is benign for the non-repeat case, where 1 was pushed just above | 1805 | * is benign for the non-repeat case, where 1 was pushed just above |
| 1787 | * to this very stack slot). | 1806 | * to this very stack slot). |
| 1788 | */ | 1807 | */ |
| 1789 | movq $1, 5*8(%rsp) | 1808 | movq $1, 10*8(%rsp) |
| 1790 | 1809 | ||
| 1791 | /* Make another copy, this one may be modified by nested NMIs */ | 1810 | /* Make another copy, this one may be modified by nested NMIs */ |
| 1811 | addq $(10*8), %rsp | ||
| 1812 | CFI_ADJUST_CFA_OFFSET -10*8 | ||
| 1792 | .rept 5 | 1813 | .rept 5 |
| 1793 | pushq_cfi 4*8(%rsp) | 1814 | pushq_cfi -6*8(%rsp) |
| 1794 | .endr | 1815 | .endr |
| 1816 | subq $(5*8), %rsp | ||
| 1795 | CFI_DEF_CFA_OFFSET SS+8-RIP | 1817 | CFI_DEF_CFA_OFFSET SS+8-RIP |
| 1796 | end_repeat_nmi: | 1818 | end_repeat_nmi: |
| 1797 | 1819 | ||
| @@ -1842,8 +1864,12 @@ nmi_swapgs: | |||
| 1842 | SWAPGS_UNSAFE_STACK | 1864 | SWAPGS_UNSAFE_STACK |
| 1843 | nmi_restore: | 1865 | nmi_restore: |
| 1844 | RESTORE_ALL 8 | 1866 | RESTORE_ALL 8 |
| 1867 | |||
| 1868 | /* Pop the extra iret frame */ | ||
| 1869 | addq $(5*8), %rsp | ||
| 1870 | |||
| 1845 | /* Clear the NMI executing stack variable */ | 1871 | /* Clear the NMI executing stack variable */ |
| 1846 | movq $0, 10*8(%rsp) | 1872 | movq $0, 5*8(%rsp) |
| 1847 | jmp irq_return | 1873 | jmp irq_return |
| 1848 | CFI_ENDPROC | 1874 | CFI_ENDPROC |
| 1849 | END(nmi) | 1875 | END(nmi) |
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 957a47aec64e..8e7f6556028f 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
| @@ -266,6 +266,19 @@ num_subarch_entries = (. - subarch_entries) / 4 | |||
| 266 | jmp default_entry | 266 | jmp default_entry |
| 267 | #endif /* CONFIG_PARAVIRT */ | 267 | #endif /* CONFIG_PARAVIRT */ |
| 268 | 268 | ||
| 269 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 270 | /* | ||
| 271 | * Boot CPU0 entry point. It's called from play_dead(). Everything has been set | ||
| 272 | * up already except stack. We just set up stack here. Then call | ||
| 273 | * start_secondary(). | ||
| 274 | */ | ||
| 275 | ENTRY(start_cpu0) | ||
| 276 | movl stack_start, %ecx | ||
| 277 | movl %ecx, %esp | ||
| 278 | jmp *(initial_code) | ||
| 279 | ENDPROC(start_cpu0) | ||
| 280 | #endif | ||
| 281 | |||
| 269 | /* | 282 | /* |
| 270 | * Non-boot CPU entry point; entered from trampoline.S | 283 | * Non-boot CPU entry point; entered from trampoline.S |
| 271 | * We can't lgdt here, because lgdt itself uses a data segment, but | 284 | * We can't lgdt here, because lgdt itself uses a data segment, but |
| @@ -292,8 +305,8 @@ default_entry: | |||
| 292 | * be using the global pages. | 305 | * be using the global pages. |
| 293 | * | 306 | * |
| 294 | * NOTE! If we are on a 486 we may have no cr4 at all! | 307 | * NOTE! If we are on a 486 we may have no cr4 at all! |
| 295 | * Specifically, cr4 exists if and only if CPUID exists, | 308 | * Specifically, cr4 exists if and only if CPUID exists |
| 296 | * which in turn exists if and only if EFLAGS.ID exists. | 309 | * and has flags other than the FPU flag set. |
| 297 | */ | 310 | */ |
| 298 | movl $X86_EFLAGS_ID,%ecx | 311 | movl $X86_EFLAGS_ID,%ecx |
| 299 | pushl %ecx | 312 | pushl %ecx |
| @@ -308,6 +321,11 @@ default_entry: | |||
| 308 | testl %ecx,%eax | 321 | testl %ecx,%eax |
| 309 | jz 6f # No ID flag = no CPUID = no CR4 | 322 | jz 6f # No ID flag = no CPUID = no CR4 |
| 310 | 323 | ||
| 324 | movl $1,%eax | ||
| 325 | cpuid | ||
| 326 | andl $~1,%edx # Ignore CPUID.FPU | ||
| 327 | jz 6f # No flags or only CPUID.FPU = no CR4 | ||
| 328 | |||
| 311 | movl pa(mmu_cr4_features),%eax | 329 | movl pa(mmu_cr4_features),%eax |
| 312 | movl %eax,%cr4 | 330 | movl %eax,%cr4 |
| 313 | 331 | ||
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 94bf9cc2c7ee..980053c4b9cc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
| @@ -252,6 +252,22 @@ ENTRY(secondary_startup_64) | |||
| 252 | pushq %rax # target address in negative space | 252 | pushq %rax # target address in negative space |
| 253 | lretq | 253 | lretq |
| 254 | 254 | ||
| 255 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 256 | /* | ||
| 257 | * Boot CPU0 entry point. It's called from play_dead(). Everything has been set | ||
| 258 | * up already except stack. We just set up stack here. Then call | ||
| 259 | * start_secondary(). | ||
| 260 | */ | ||
| 261 | ENTRY(start_cpu0) | ||
| 262 | movq stack_start(%rip),%rsp | ||
| 263 | movq initial_code(%rip),%rax | ||
| 264 | pushq $0 # fake return address to stop unwinder | ||
| 265 | pushq $__KERNEL_CS # set correct cs | ||
| 266 | pushq %rax # target address in negative space | ||
| 267 | lretq | ||
| 268 | ENDPROC(start_cpu0) | ||
| 269 | #endif | ||
| 270 | |||
| 255 | /* SMP bootup changes these two */ | 271 | /* SMP bootup changes these two */ |
| 256 | __REFDATA | 272 | __REFDATA |
| 257 | .align 8 | 273 | .align 8 |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 1460a5df92f7..e28670f9a589 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
| @@ -434,7 +434,7 @@ void hpet_msi_unmask(struct irq_data *data) | |||
| 434 | 434 | ||
| 435 | /* unmask it */ | 435 | /* unmask it */ |
| 436 | cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); | 436 | cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); |
| 437 | cfg |= HPET_TN_FSB; | 437 | cfg |= HPET_TN_ENABLE | HPET_TN_FSB; |
| 438 | hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); | 438 | hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); |
| 439 | } | 439 | } |
| 440 | 440 | ||
| @@ -445,7 +445,7 @@ void hpet_msi_mask(struct irq_data *data) | |||
| 445 | 445 | ||
| 446 | /* mask it */ | 446 | /* mask it */ |
| 447 | cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); | 447 | cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); |
| 448 | cfg &= ~HPET_TN_FSB; | 448 | cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB); |
| 449 | hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); | 449 | hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); |
| 450 | } | 450 | } |
| 451 | 451 | ||
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 675a05012449..245a71db401a 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
| @@ -175,7 +175,11 @@ void __cpuinit fpu_init(void) | |||
| 175 | cr0 |= X86_CR0_EM; | 175 | cr0 |= X86_CR0_EM; |
| 176 | write_cr0(cr0); | 176 | write_cr0(cr0); |
| 177 | 177 | ||
| 178 | if (!smp_processor_id()) | 178 | /* |
| 179 | * init_thread_xstate is only called once to avoid overriding | ||
| 180 | * xstate_size during boot time or during CPU hotplug. | ||
| 181 | */ | ||
| 182 | if (xstate_size == 0) | ||
| 179 | init_thread_xstate(); | 183 | init_thread_xstate(); |
| 180 | 184 | ||
| 181 | mxcsr_feature_mask_init(); | 185 | mxcsr_feature_mask_init(); |
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 6e03b0d69138..7dc4e459c2b3 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
| @@ -42,39 +42,6 @@ | |||
| 42 | * (these are usually mapped into the 0x30-0xff vector range) | 42 | * (these are usually mapped into the 0x30-0xff vector range) |
| 43 | */ | 43 | */ |
| 44 | 44 | ||
| 45 | #ifdef CONFIG_X86_32 | ||
| 46 | /* | ||
| 47 | * Note that on a 486, we don't want to do a SIGFPE on an irq13 | ||
| 48 | * as the irq is unreliable, and exception 16 works correctly | ||
| 49 | * (ie as explained in the intel literature). On a 386, you | ||
| 50 | * can't use exception 16 due to bad IBM design, so we have to | ||
| 51 | * rely on the less exact irq13. | ||
| 52 | * | ||
| 53 | * Careful.. Not only is IRQ13 unreliable, but it is also | ||
| 54 | * leads to races. IBM designers who came up with it should | ||
| 55 | * be shot. | ||
| 56 | */ | ||
| 57 | |||
| 58 | static irqreturn_t math_error_irq(int cpl, void *dev_id) | ||
| 59 | { | ||
| 60 | outb(0, 0xF0); | ||
| 61 | if (ignore_fpu_irq || !boot_cpu_data.hard_math) | ||
| 62 | return IRQ_NONE; | ||
| 63 | math_error(get_irq_regs(), 0, X86_TRAP_MF); | ||
| 64 | return IRQ_HANDLED; | ||
| 65 | } | ||
| 66 | |||
| 67 | /* | ||
| 68 | * New motherboards sometimes make IRQ 13 be a PCI interrupt, | ||
| 69 | * so allow interrupt sharing. | ||
| 70 | */ | ||
| 71 | static struct irqaction fpu_irq = { | ||
| 72 | .handler = math_error_irq, | ||
| 73 | .name = "fpu", | ||
| 74 | .flags = IRQF_NO_THREAD, | ||
| 75 | }; | ||
| 76 | #endif | ||
| 77 | |||
| 78 | /* | 45 | /* |
| 79 | * IRQ2 is cascade interrupt to second interrupt controller | 46 | * IRQ2 is cascade interrupt to second interrupt controller |
| 80 | */ | 47 | */ |
| @@ -242,13 +209,6 @@ void __init native_init_IRQ(void) | |||
| 242 | setup_irq(2, &irq2); | 209 | setup_irq(2, &irq2); |
| 243 | 210 | ||
| 244 | #ifdef CONFIG_X86_32 | 211 | #ifdef CONFIG_X86_32 |
| 245 | /* | ||
| 246 | * External FPU? Set up irq13 if so, for | ||
| 247 | * original braindamaged IBM FERR coupling. | ||
| 248 | */ | ||
| 249 | if (boot_cpu_data.hard_math && !cpu_has_fpu) | ||
| 250 | setup_irq(FPU_IRQ, &fpu_irq); | ||
| 251 | |||
| 252 | irq_ctx_init(smp_processor_id()); | 212 | irq_ctx_init(smp_processor_id()); |
| 253 | #endif | 213 | #endif |
| 254 | } | 214 | } |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4180a874c764..9c2bd8bd4b4c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
| @@ -42,6 +42,8 @@ | |||
| 42 | #include <asm/apic.h> | 42 | #include <asm/apic.h> |
| 43 | #include <asm/apicdef.h> | 43 | #include <asm/apicdef.h> |
| 44 | #include <asm/hypervisor.h> | 44 | #include <asm/hypervisor.h> |
| 45 | #include <asm/kvm_guest.h> | ||
| 46 | #include <asm/context_tracking.h> | ||
| 45 | 47 | ||
| 46 | static int kvmapf = 1; | 48 | static int kvmapf = 1; |
| 47 | 49 | ||
| @@ -62,6 +64,15 @@ static int parse_no_stealacc(char *arg) | |||
| 62 | 64 | ||
| 63 | early_param("no-steal-acc", parse_no_stealacc); | 65 | early_param("no-steal-acc", parse_no_stealacc); |
| 64 | 66 | ||
| 67 | static int kvmclock_vsyscall = 1; | ||
| 68 | static int parse_no_kvmclock_vsyscall(char *arg) | ||
| 69 | { | ||
| 70 | kvmclock_vsyscall = 0; | ||
| 71 | return 0; | ||
| 72 | } | ||
| 73 | |||
| 74 | early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); | ||
| 75 | |||
| 65 | static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); | 76 | static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); |
| 66 | static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); | 77 | static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); |
| 67 | static int has_steal_clock = 0; | 78 | static int has_steal_clock = 0; |
| @@ -110,11 +121,8 @@ void kvm_async_pf_task_wait(u32 token) | |||
| 110 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; | 121 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; |
| 111 | struct kvm_task_sleep_node n, *e; | 122 | struct kvm_task_sleep_node n, *e; |
| 112 | DEFINE_WAIT(wait); | 123 | DEFINE_WAIT(wait); |
| 113 | int cpu, idle; | ||
| 114 | 124 | ||
| 115 | cpu = get_cpu(); | 125 | rcu_irq_enter(); |
| 116 | idle = idle_cpu(cpu); | ||
| 117 | put_cpu(); | ||
| 118 | 126 | ||
| 119 | spin_lock(&b->lock); | 127 | spin_lock(&b->lock); |
| 120 | e = _find_apf_task(b, token); | 128 | e = _find_apf_task(b, token); |
| @@ -123,12 +131,14 @@ void kvm_async_pf_task_wait(u32 token) | |||
| 123 | hlist_del(&e->link); | 131 | hlist_del(&e->link); |
| 124 | kfree(e); | 132 | kfree(e); |
| 125 | spin_unlock(&b->lock); | 133 | spin_unlock(&b->lock); |
| 134 | |||
| 135 | rcu_irq_exit(); | ||
| 126 | return; | 136 | return; |
| 127 | } | 137 | } |
| 128 | 138 | ||
| 129 | n.token = token; | 139 | n.token = token; |
| 130 | n.cpu = smp_processor_id(); | 140 | n.cpu = smp_processor_id(); |
| 131 | n.halted = idle || preempt_count() > 1; | 141 | n.halted = is_idle_task(current) || preempt_count() > 1; |
| 132 | init_waitqueue_head(&n.wq); | 142 | init_waitqueue_head(&n.wq); |
| 133 | hlist_add_head(&n.link, &b->list); | 143 | hlist_add_head(&n.link, &b->list); |
| 134 | spin_unlock(&b->lock); | 144 | spin_unlock(&b->lock); |
| @@ -147,13 +157,16 @@ void kvm_async_pf_task_wait(u32 token) | |||
| 147 | /* | 157 | /* |
| 148 | * We cannot reschedule. So halt. | 158 | * We cannot reschedule. So halt. |
| 149 | */ | 159 | */ |
| 160 | rcu_irq_exit(); | ||
| 150 | native_safe_halt(); | 161 | native_safe_halt(); |
| 162 | rcu_irq_enter(); | ||
| 151 | local_irq_disable(); | 163 | local_irq_disable(); |
| 152 | } | 164 | } |
| 153 | } | 165 | } |
| 154 | if (!n.halted) | 166 | if (!n.halted) |
| 155 | finish_wait(&n.wq, &wait); | 167 | finish_wait(&n.wq, &wait); |
| 156 | 168 | ||
| 169 | rcu_irq_exit(); | ||
| 157 | return; | 170 | return; |
| 158 | } | 171 | } |
| 159 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); | 172 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); |
| @@ -247,10 +260,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
| 247 | break; | 260 | break; |
| 248 | case KVM_PV_REASON_PAGE_NOT_PRESENT: | 261 | case KVM_PV_REASON_PAGE_NOT_PRESENT: |
| 249 | /* page is swapped out by the host. */ | 262 | /* page is swapped out by the host. */ |
| 250 | rcu_irq_enter(); | 263 | exception_enter(regs); |
| 251 | exit_idle(); | 264 | exit_idle(); |
| 252 | kvm_async_pf_task_wait((u32)read_cr2()); | 265 | kvm_async_pf_task_wait((u32)read_cr2()); |
| 253 | rcu_irq_exit(); | 266 | exception_exit(regs); |
| 254 | break; | 267 | break; |
| 255 | case KVM_PV_REASON_PAGE_READY: | 268 | case KVM_PV_REASON_PAGE_READY: |
| 256 | rcu_irq_enter(); | 269 | rcu_irq_enter(); |
| @@ -471,6 +484,9 @@ void __init kvm_guest_init(void) | |||
| 471 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) | 484 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
| 472 | apic_set_eoi_write(kvm_guest_apic_eoi_write); | 485 | apic_set_eoi_write(kvm_guest_apic_eoi_write); |
| 473 | 486 | ||
| 487 | if (kvmclock_vsyscall) | ||
| 488 | kvm_setup_vsyscall_timeinfo(); | ||
| 489 | |||
| 474 | #ifdef CONFIG_SMP | 490 | #ifdef CONFIG_SMP |
| 475 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | 491 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; |
| 476 | register_cpu_notifier(&kvm_cpu_notifier); | 492 | register_cpu_notifier(&kvm_cpu_notifier); |
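The kvm_async_pf_task_wait() hunks above bracket the whole wait with rcu_irq_enter()/rcu_irq_exit() and temporarily drop that bracket around the halt itself, so RCU never believes the CPU is busy while it sits in HLT. A schematic, self-contained sketch of that enter/exit discipline; the rcu_* and halt functions here are local stubs that only log, not the kernel APIs.

#include <stdbool.h>
#include <stdio.h>

/* local stand-ins that only log; the real calls live in kernel/rcu and arch code */
static void rcu_irq_enter(void) { puts("rcu: CPU marked non-idle"); }
static void rcu_irq_exit(void)  { puts("rcu: CPU may be idle again"); }
static void safe_halt(void)     { puts("halt until the next interrupt"); }

static void async_pf_wait(bool token_already_done)
{
	rcu_irq_enter();		/* the fault handler may run from the idle task */

	if (token_already_done) {
		rcu_irq_exit();		/* the early-return path still balances the enter */
		return;
	}

	/* cannot reschedule, so halt -- but not while RCU thinks the CPU is busy */
	rcu_irq_exit();
	safe_halt();
	rcu_irq_enter();

	rcu_irq_exit();			/* final exit matches the initial enter */
}

int main(void)
{
	async_pf_wait(false);
	return 0;
}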
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f1b42b3a186c..220a360010f8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <asm/apic.h> | 23 | #include <asm/apic.h> |
| 24 | #include <linux/percpu.h> | 24 | #include <linux/percpu.h> |
| 25 | #include <linux/hardirq.h> | 25 | #include <linux/hardirq.h> |
| 26 | #include <linux/memblock.h> | ||
| 26 | 27 | ||
| 27 | #include <asm/x86_init.h> | 28 | #include <asm/x86_init.h> |
| 28 | #include <asm/reboot.h> | 29 | #include <asm/reboot.h> |
| @@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg) | |||
| 39 | early_param("no-kvmclock", parse_no_kvmclock); | 40 | early_param("no-kvmclock", parse_no_kvmclock); |
| 40 | 41 | ||
| 41 | /* The hypervisor will put information about time periodically here */ | 42 | /* The hypervisor will put information about time periodically here */ |
| 42 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); | 43 | static struct pvclock_vsyscall_time_info *hv_clock; |
| 43 | static struct pvclock_wall_clock wall_clock; | 44 | static struct pvclock_wall_clock wall_clock; |
| 44 | 45 | ||
| 45 | /* | 46 | /* |
| @@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void) | |||
| 52 | struct pvclock_vcpu_time_info *vcpu_time; | 53 | struct pvclock_vcpu_time_info *vcpu_time; |
| 53 | struct timespec ts; | 54 | struct timespec ts; |
| 54 | int low, high; | 55 | int low, high; |
| 56 | int cpu; | ||
| 55 | 57 | ||
| 56 | low = (int)__pa_symbol(&wall_clock); | 58 | low = (int)__pa_symbol(&wall_clock); |
| 57 | high = ((u64)__pa_symbol(&wall_clock) >> 32); | 59 | high = ((u64)__pa_symbol(&wall_clock) >> 32); |
| 58 | 60 | ||
| 59 | native_write_msr(msr_kvm_wall_clock, low, high); | 61 | native_write_msr(msr_kvm_wall_clock, low, high); |
| 60 | 62 | ||
| 61 | vcpu_time = &get_cpu_var(hv_clock); | 63 | preempt_disable(); |
| 64 | cpu = smp_processor_id(); | ||
| 65 | |||
| 66 | vcpu_time = &hv_clock[cpu].pvti; | ||
| 62 | pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); | 67 | pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); |
| 63 | put_cpu_var(hv_clock); | 68 | |
| 69 | preempt_enable(); | ||
| 64 | 70 | ||
| 65 | return ts.tv_sec; | 71 | return ts.tv_sec; |
| 66 | } | 72 | } |
| @@ -74,9 +80,11 @@ static cycle_t kvm_clock_read(void) | |||
| 74 | { | 80 | { |
| 75 | struct pvclock_vcpu_time_info *src; | 81 | struct pvclock_vcpu_time_info *src; |
| 76 | cycle_t ret; | 82 | cycle_t ret; |
| 83 | int cpu; | ||
| 77 | 84 | ||
| 78 | preempt_disable_notrace(); | 85 | preempt_disable_notrace(); |
| 79 | src = &__get_cpu_var(hv_clock); | 86 | cpu = smp_processor_id(); |
| 87 | src = &hv_clock[cpu].pvti; | ||
| 80 | ret = pvclock_clocksource_read(src); | 88 | ret = pvclock_clocksource_read(src); |
| 81 | preempt_enable_notrace(); | 89 | preempt_enable_notrace(); |
| 82 | return ret; | 90 | return ret; |
| @@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) | |||
| 99 | static unsigned long kvm_get_tsc_khz(void) | 107 | static unsigned long kvm_get_tsc_khz(void) |
| 100 | { | 108 | { |
| 101 | struct pvclock_vcpu_time_info *src; | 109 | struct pvclock_vcpu_time_info *src; |
| 102 | src = &per_cpu(hv_clock, 0); | 110 | int cpu; |
| 103 | return pvclock_tsc_khz(src); | 111 | unsigned long tsc_khz; |
| 112 | |||
| 113 | preempt_disable(); | ||
| 114 | cpu = smp_processor_id(); | ||
| 115 | src = &hv_clock[cpu].pvti; | ||
| 116 | tsc_khz = pvclock_tsc_khz(src); | ||
| 117 | preempt_enable(); | ||
| 118 | return tsc_khz; | ||
| 104 | } | 119 | } |
| 105 | 120 | ||
| 106 | static void kvm_get_preset_lpj(void) | 121 | static void kvm_get_preset_lpj(void) |
| @@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void) | |||
| 119 | { | 134 | { |
| 120 | bool ret = false; | 135 | bool ret = false; |
| 121 | struct pvclock_vcpu_time_info *src; | 136 | struct pvclock_vcpu_time_info *src; |
| 137 | int cpu = smp_processor_id(); | ||
| 122 | 138 | ||
| 123 | src = &__get_cpu_var(hv_clock); | 139 | if (!hv_clock) |
| 140 | return ret; | ||
| 141 | |||
| 142 | src = &hv_clock[cpu].pvti; | ||
| 124 | if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { | 143 | if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { |
| 125 | __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); | 144 | src->flags &= ~PVCLOCK_GUEST_STOPPED; |
| 126 | ret = true; | 145 | ret = true; |
| 127 | } | 146 | } |
| 128 | 147 | ||
| @@ -141,9 +160,10 @@ int kvm_register_clock(char *txt) | |||
| 141 | { | 160 | { |
| 142 | int cpu = smp_processor_id(); | 161 | int cpu = smp_processor_id(); |
| 143 | int low, high, ret; | 162 | int low, high, ret; |
| 163 | struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; | ||
| 144 | 164 | ||
| 145 | low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; | 165 | low = (int)__pa(src) | 1; |
| 146 | high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); | 166 | high = ((u64)__pa(src) >> 32); |
| 147 | ret = native_write_msr_safe(msr_kvm_system_time, low, high); | 167 | ret = native_write_msr_safe(msr_kvm_system_time, low, high); |
| 148 | printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", | 168 | printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", |
| 149 | cpu, high, low, txt); | 169 | cpu, high, low, txt); |
| @@ -197,6 +217,8 @@ static void kvm_shutdown(void) | |||
| 197 | 217 | ||
| 198 | void __init kvmclock_init(void) | 218 | void __init kvmclock_init(void) |
| 199 | { | 219 | { |
| 220 | unsigned long mem; | ||
| 221 | |||
| 200 | if (!kvm_para_available()) | 222 | if (!kvm_para_available()) |
| 201 | return; | 223 | return; |
| 202 | 224 | ||
| @@ -209,8 +231,18 @@ void __init kvmclock_init(void) | |||
| 209 | printk(KERN_INFO "kvm-clock: Using msrs %x and %x", | 231 | printk(KERN_INFO "kvm-clock: Using msrs %x and %x", |
| 210 | msr_kvm_system_time, msr_kvm_wall_clock); | 232 | msr_kvm_system_time, msr_kvm_wall_clock); |
| 211 | 233 | ||
| 212 | if (kvm_register_clock("boot clock")) | 234 | mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS, |
| 235 | PAGE_SIZE); | ||
| 236 | if (!mem) | ||
| 237 | return; | ||
| 238 | hv_clock = __va(mem); | ||
| 239 | |||
| 240 | if (kvm_register_clock("boot clock")) { | ||
| 241 | hv_clock = NULL; | ||
| 242 | memblock_free(mem, | ||
| 243 | sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); | ||
| 213 | return; | 244 | return; |
| 245 | } | ||
| 214 | pv_time_ops.sched_clock = kvm_clock_read; | 246 | pv_time_ops.sched_clock = kvm_clock_read; |
| 215 | x86_platform.calibrate_tsc = kvm_get_tsc_khz; | 247 | x86_platform.calibrate_tsc = kvm_get_tsc_khz; |
| 216 | x86_platform.get_wallclock = kvm_get_wallclock; | 248 | x86_platform.get_wallclock = kvm_get_wallclock; |
| @@ -233,3 +265,37 @@ void __init kvmclock_init(void) | |||
| 233 | if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) | 265 | if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) |
| 234 | pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); | 266 | pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); |
| 235 | } | 267 | } |
| 268 | |||
| 269 | int __init kvm_setup_vsyscall_timeinfo(void) | ||
| 270 | { | ||
| 271 | #ifdef CONFIG_X86_64 | ||
| 272 | int cpu; | ||
| 273 | int ret; | ||
| 274 | u8 flags; | ||
| 275 | struct pvclock_vcpu_time_info *vcpu_time; | ||
| 276 | unsigned int size; | ||
| 277 | |||
| 278 | size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS; | ||
| 279 | |||
| 280 | preempt_disable(); | ||
| 281 | cpu = smp_processor_id(); | ||
| 282 | |||
| 283 | vcpu_time = &hv_clock[cpu].pvti; | ||
| 284 | flags = pvclock_read_flags(vcpu_time); | ||
| 285 | |||
| 286 | if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { | ||
| 287 | preempt_enable(); | ||
| 288 | return 1; | ||
| 289 | } | ||
| 290 | |||
| 291 | if ((ret = pvclock_init_vsyscall(hv_clock, size))) { | ||
| 292 | preempt_enable(); | ||
| 293 | return ret; | ||
| 294 | } | ||
| 295 | |||
| 296 | preempt_enable(); | ||
| 297 | |||
| 298 | kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; | ||
| 299 | #endif | ||
| 300 | return 0; | ||
| 301 | } | ||
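kvm_register_clock() above now points the per-cpu system-time MSR at one slot of the contiguous hv_clock array; the 64-bit physical address is split into two 32-bit MSR halves, with bit 0 of the low half doubling as the enable bit. A tiny sketch of that split, pure arithmetic with a made-up address and no MSR access.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* pretend physical address of this CPU's pvclock_vcpu_time_info */
	uint64_t pa = 0x12345f000ULL;

	uint32_t low  = (uint32_t)pa | 1;		/* bit 0 = enable */
	uint32_t high = (uint32_t)(pa >> 32);

	printf("msr halves: high=%#x low=%#x\n", (unsigned)high, (unsigned)low);

	/* the address round-trips once the enable bit is masked off */
	uint64_t back = ((uint64_t)high << 32) | (low & ~1u);
	printf("round trip: %#llx\n", (unsigned long long)back);
	return 0;
}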
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 7720ff5a9ee2..efdec7cd8e01 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c | |||
| @@ -8,8 +8,8 @@ | |||
| 8 | * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> | 8 | * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> |
| 9 | * | 9 | * |
| 10 | * Maintainers: | 10 | * Maintainers: |
| 11 | * Andreas Herrmann <andreas.herrmann3@amd.com> | 11 | * Andreas Herrmann <herrmann.der.user@googlemail.com> |
| 12 | * Borislav Petkov <borislav.petkov@amd.com> | 12 | * Borislav Petkov <bp@alien8.de> |
| 13 | * | 13 | * |
| 14 | * This driver allows to upgrade microcode on F10h AMD | 14 | * This driver allows to upgrade microcode on F10h AMD |
| 15 | * CPUs and later. | 15 | * CPUs and later. |
| @@ -190,6 +190,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, | |||
| 190 | #define F1XH_MPB_MAX_SIZE 2048 | 190 | #define F1XH_MPB_MAX_SIZE 2048 |
| 191 | #define F14H_MPB_MAX_SIZE 1824 | 191 | #define F14H_MPB_MAX_SIZE 1824 |
| 192 | #define F15H_MPB_MAX_SIZE 4096 | 192 | #define F15H_MPB_MAX_SIZE 4096 |
| 193 | #define F16H_MPB_MAX_SIZE 3458 | ||
| 193 | 194 | ||
| 194 | switch (c->x86) { | 195 | switch (c->x86) { |
| 195 | case 0x14: | 196 | case 0x14: |
| @@ -198,6 +199,9 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, | |||
| 198 | case 0x15: | 199 | case 0x15: |
| 199 | max_size = F15H_MPB_MAX_SIZE; | 200 | max_size = F15H_MPB_MAX_SIZE; |
| 200 | break; | 201 | break; |
| 202 | case 0x16: | ||
| 203 | max_size = F16H_MPB_MAX_SIZE; | ||
| 204 | break; | ||
| 201 | default: | 205 | default: |
| 202 | max_size = F1XH_MPB_MAX_SIZE; | 206 | max_size = F1XH_MPB_MAX_SIZE; |
| 203 | break; | 207 | break; |
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index de2b7ad70273..0f5dec5c80e0 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
| @@ -265,7 +265,7 @@ rootfs_initcall(pci_iommu_init); | |||
| 265 | #ifdef CONFIG_PCI | 265 | #ifdef CONFIG_PCI |
| 266 | /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ | 266 | /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ |
| 267 | 267 | ||
| 268 | static __devinit void via_no_dac(struct pci_dev *dev) | 268 | static void via_no_dac(struct pci_dev *dev) |
| 269 | { | 269 | { |
| 270 | if (forbid_dac == 0) { | 270 | if (forbid_dac == 0) { |
| 271 | dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); | 271 | dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b644e1c765dc..2ed787f15bf0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
| @@ -262,36 +262,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, | |||
| 262 | propagate_user_return_notify(prev_p, next_p); | 262 | propagate_user_return_notify(prev_p, next_p); |
| 263 | } | 263 | } |
| 264 | 264 | ||
| 265 | int sys_fork(struct pt_regs *regs) | ||
| 266 | { | ||
| 267 | return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); | ||
| 268 | } | ||
| 269 | |||
| 270 | /* | ||
| 271 | * This is trivial, and on the face of it looks like it | ||
| 272 | * could equally well be done in user mode. | ||
| 273 | * | ||
| 274 | * Not so, for quite unobvious reasons - register pressure. | ||
| 275 | * In user mode vfork() cannot have a stack frame, and if | ||
| 276 | * done by calling the "clone()" system call directly, you | ||
| 277 | * do not have enough call-clobbered registers to hold all | ||
| 278 | * the information you need. | ||
| 279 | */ | ||
| 280 | int sys_vfork(struct pt_regs *regs) | ||
| 281 | { | ||
| 282 | return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, | ||
| 283 | NULL, NULL); | ||
| 284 | } | ||
| 285 | |||
| 286 | long | ||
| 287 | sys_clone(unsigned long clone_flags, unsigned long newsp, | ||
| 288 | void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) | ||
| 289 | { | ||
| 290 | if (!newsp) | ||
| 291 | newsp = regs->sp; | ||
| 292 | return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); | ||
| 293 | } | ||
| 294 | |||
| 295 | /* | 265 | /* |
| 296 | * Idle related variables and functions | 266 | * Idle related variables and functions |
| 297 | */ | 267 | */ |
| @@ -306,11 +276,6 @@ void (*pm_idle)(void); | |||
| 306 | EXPORT_SYMBOL(pm_idle); | 276 | EXPORT_SYMBOL(pm_idle); |
| 307 | #endif | 277 | #endif |
| 308 | 278 | ||
| 309 | static inline int hlt_use_halt(void) | ||
| 310 | { | ||
| 311 | return 1; | ||
| 312 | } | ||
| 313 | |||
| 314 | #ifndef CONFIG_SMP | 279 | #ifndef CONFIG_SMP |
| 315 | static inline void play_dead(void) | 280 | static inline void play_dead(void) |
| 316 | { | 281 | { |
| @@ -410,28 +375,22 @@ void cpu_idle(void) | |||
| 410 | */ | 375 | */ |
| 411 | void default_idle(void) | 376 | void default_idle(void) |
| 412 | { | 377 | { |
| 413 | if (hlt_use_halt()) { | 378 | trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); |
| 414 | trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); | 379 | trace_cpu_idle_rcuidle(1, smp_processor_id()); |
| 415 | trace_cpu_idle_rcuidle(1, smp_processor_id()); | 380 | current_thread_info()->status &= ~TS_POLLING; |
| 416 | current_thread_info()->status &= ~TS_POLLING; | 381 | /* |
| 417 | /* | 382 | * TS_POLLING-cleared state must be visible before we |
| 418 | * TS_POLLING-cleared state must be visible before we | 383 | * test NEED_RESCHED: |
| 419 | * test NEED_RESCHED: | 384 | */ |
| 420 | */ | 385 | smp_mb(); |
| 421 | smp_mb(); | ||
| 422 | 386 | ||
| 423 | if (!need_resched()) | 387 | if (!need_resched()) |
| 424 | safe_halt(); /* enables interrupts racelessly */ | 388 | safe_halt(); /* enables interrupts racelessly */ |
| 425 | else | 389 | else |
| 426 | local_irq_enable(); | ||
| 427 | current_thread_info()->status |= TS_POLLING; | ||
| 428 | trace_power_end_rcuidle(smp_processor_id()); | ||
| 429 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | ||
| 430 | } else { | ||
| 431 | local_irq_enable(); | 390 | local_irq_enable(); |
| 432 | /* loop is done by the caller */ | 391 | current_thread_info()->status |= TS_POLLING; |
| 433 | cpu_relax(); | 392 | trace_power_end_rcuidle(smp_processor_id()); |
| 434 | } | 393 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
| 435 | } | 394 | } |
| 436 | #ifdef CONFIG_APM_MODULE | 395 | #ifdef CONFIG_APM_MODULE |
| 437 | EXPORT_SYMBOL(default_idle); | 396 | EXPORT_SYMBOL(default_idle); |
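With hlt_use_halt() gone (it always returned 1), default_idle() above is just the flattened body: clear the polling flag, force a full barrier, and only halt if no reschedule is pending, so a wakeup cannot slip in between the check and the HLT. A user-space analogue of that clear-flag/barrier/re-check ordering, using C11 atomics as stand-ins for TS_POLLING and need_resched().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool polling = true;	/* stand-in for TS_POLLING */
static atomic_bool resched_pending;	/* stand-in for need_resched() */

static void idle_once(void)
{
	atomic_store(&polling, false);

	/* the cleared-polling state must be visible before need_resched is tested */
	atomic_thread_fence(memory_order_seq_cst);

	if (!atomic_load(&resched_pending))
		puts("halt (would wake on the next interrupt)");
	else
		puts("reschedule pending, skip the halt");

	atomic_store(&polling, true);
}

int main(void)
{
	idle_once();
	atomic_store(&resched_pending, true);
	idle_once();
	return 0;
}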
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 44e0bff38e72..b5a8905785e6 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
| @@ -128,8 +128,7 @@ void release_thread(struct task_struct *dead_task) | |||
| 128 | } | 128 | } |
| 129 | 129 | ||
| 130 | int copy_thread(unsigned long clone_flags, unsigned long sp, | 130 | int copy_thread(unsigned long clone_flags, unsigned long sp, |
| 131 | unsigned long arg, | 131 | unsigned long arg, struct task_struct *p) |
| 132 | struct task_struct *p, struct pt_regs *regs) | ||
| 133 | { | 132 | { |
| 134 | struct pt_regs *childregs = task_pt_regs(p); | 133 | struct pt_regs *childregs = task_pt_regs(p); |
| 135 | struct task_struct *tsk; | 134 | struct task_struct *tsk; |
| @@ -138,7 +137,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
| 138 | p->thread.sp = (unsigned long) childregs; | 137 | p->thread.sp = (unsigned long) childregs; |
| 139 | p->thread.sp0 = (unsigned long) (childregs+1); | 138 | p->thread.sp0 = (unsigned long) (childregs+1); |
| 140 | 139 | ||
| 141 | if (unlikely(!regs)) { | 140 | if (unlikely(p->flags & PF_KTHREAD)) { |
| 142 | /* kernel thread */ | 141 | /* kernel thread */ |
| 143 | memset(childregs, 0, sizeof(struct pt_regs)); | 142 | memset(childregs, 0, sizeof(struct pt_regs)); |
| 144 | p->thread.ip = (unsigned long) ret_from_kernel_thread; | 143 | p->thread.ip = (unsigned long) ret_from_kernel_thread; |
| @@ -156,12 +155,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
| 156 | memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); | 155 | memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); |
| 157 | return 0; | 156 | return 0; |
| 158 | } | 157 | } |
| 159 | *childregs = *regs; | 158 | *childregs = *current_pt_regs(); |
| 160 | childregs->ax = 0; | 159 | childregs->ax = 0; |
| 161 | childregs->sp = sp; | 160 | if (sp) |
| 161 | childregs->sp = sp; | ||
| 162 | 162 | ||
| 163 | p->thread.ip = (unsigned long) ret_from_fork; | 163 | p->thread.ip = (unsigned long) ret_from_fork; |
| 164 | task_user_gs(p) = get_user_gs(regs); | 164 | task_user_gs(p) = get_user_gs(current_pt_regs()); |
| 165 | 165 | ||
| 166 | p->fpu_counter = 0; | 166 | p->fpu_counter = 0; |
| 167 | p->thread.io_bitmap_ptr = NULL; | 167 | p->thread.io_bitmap_ptr = NULL; |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 16c6365e2b86..6e68a6194965 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
| @@ -146,8 +146,7 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) | |||
| 146 | } | 146 | } |
| 147 | 147 | ||
| 148 | int copy_thread(unsigned long clone_flags, unsigned long sp, | 148 | int copy_thread(unsigned long clone_flags, unsigned long sp, |
| 149 | unsigned long arg, | 149 | unsigned long arg, struct task_struct *p) |
| 150 | struct task_struct *p, struct pt_regs *regs) | ||
| 151 | { | 150 | { |
| 152 | int err; | 151 | int err; |
| 153 | struct pt_regs *childregs; | 152 | struct pt_regs *childregs; |
| @@ -169,7 +168,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
| 169 | savesegment(ds, p->thread.ds); | 168 | savesegment(ds, p->thread.ds); |
| 170 | memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); | 169 | memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); |
| 171 | 170 | ||
| 172 | if (unlikely(!regs)) { | 171 | if (unlikely(p->flags & PF_KTHREAD)) { |
| 173 | /* kernel thread */ | 172 | /* kernel thread */ |
| 174 | memset(childregs, 0, sizeof(struct pt_regs)); | 173 | memset(childregs, 0, sizeof(struct pt_regs)); |
| 175 | childregs->sp = (unsigned long)childregs; | 174 | childregs->sp = (unsigned long)childregs; |
| @@ -181,10 +180,11 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
| 181 | childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; | 180 | childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; |
| 182 | return 0; | 181 | return 0; |
| 183 | } | 182 | } |
| 184 | *childregs = *regs; | 183 | *childregs = *current_pt_regs(); |
| 185 | 184 | ||
| 186 | childregs->ax = 0; | 185 | childregs->ax = 0; |
| 187 | childregs->sp = sp; | 186 | if (sp) |
| 187 | childregs->sp = sp; | ||
| 188 | 188 | ||
| 189 | err = -ENOMEM; | 189 | err = -ENOMEM; |
| 190 | memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); | 190 | memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); |
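In both the 32-bit and 64-bit copy_thread() hunks the explicit pt_regs argument disappears: kernel threads are recognized by PF_KTHREAD, user children copy *current_pt_regs(), and a caller-supplied stack pointer only overrides the copied one when it is non-zero (the check the removed sys_clone() used to do with "if (!newsp) newsp = regs->sp"). A compact sketch of that selection logic with toy types, not the kernel's structures.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct toy_regs { unsigned long sp, ax; };

/* stand-in for current_pt_regs() */
static struct toy_regs parent_regs = { .sp = 0x7fff0000, .ax = 42 };

static void copy_thread_sketch(bool kthread, unsigned long sp,
			       struct toy_regs *childregs)
{
	if (kthread) {			/* PF_KTHREAD case */
		memset(childregs, 0, sizeof(*childregs));
		return;
	}
	*childregs = parent_regs;	/* copy the parent's user frame */
	childregs->ax = 0;		/* the child sees 0 from fork/clone */
	if (sp)				/* only override when a new stack was given */
		childregs->sp = sp;
}

int main(void)
{
	struct toy_regs child;

	copy_thread_sketch(false, 0, &child);		/* plain fork keeps the parent sp */
	printf("fork child: sp=%#lx ax=%lu\n", child.sp, child.ax);

	copy_thread_sketch(false, 0x7ffe0000, &child);	/* clone with a new stack */
	printf("clone child: sp=%#lx\n", child.sp);
	return 0;
}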
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b00b33a18390..b629bbe0d9bd 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
| @@ -22,6 +22,8 @@ | |||
| 22 | #include <linux/perf_event.h> | 22 | #include <linux/perf_event.h> |
| 23 | #include <linux/hw_breakpoint.h> | 23 | #include <linux/hw_breakpoint.h> |
| 24 | #include <linux/rcupdate.h> | 24 | #include <linux/rcupdate.h> |
| 25 | #include <linux/module.h> | ||
| 26 | #include <linux/context_tracking.h> | ||
| 25 | 27 | ||
| 26 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
| 27 | #include <asm/pgtable.h> | 29 | #include <asm/pgtable.h> |
| @@ -166,6 +168,35 @@ static inline bool invalid_selector(u16 value) | |||
| 166 | 168 | ||
| 167 | #define FLAG_MASK FLAG_MASK_32 | 169 | #define FLAG_MASK FLAG_MASK_32 |
| 168 | 170 | ||
| 171 | /* | ||
| 172 | * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode | ||
| 173 | * when it traps. The previous stack will be directly underneath the saved | ||
| 174 | * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'. | ||
| 175 | * | ||
| 176 | * Now, if the stack is empty, '&regs->sp' is out of range. In this | ||
| 177 | * case we try to take the previous stack. To always return a non-null | ||
| 178 | * stack pointer we fall back to regs as stack if no previous stack | ||
| 179 | * exists. | ||
| 180 | * | ||
| 181 | * This is valid only for kernel mode traps. | ||
| 182 | */ | ||
| 183 | unsigned long kernel_stack_pointer(struct pt_regs *regs) | ||
| 184 | { | ||
| 185 | unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); | ||
| 186 | unsigned long sp = (unsigned long)&regs->sp; | ||
| 187 | struct thread_info *tinfo; | ||
| 188 | |||
| 189 | if (context == (sp & ~(THREAD_SIZE - 1))) | ||
| 190 | return sp; | ||
| 191 | |||
| 192 | tinfo = (struct thread_info *)context; | ||
| 193 | if (tinfo->previous_esp) | ||
| 194 | return tinfo->previous_esp; | ||
| 195 | |||
| 196 | return (unsigned long)regs; | ||
| 197 | } | ||
| 198 | EXPORT_SYMBOL_GPL(kernel_stack_pointer); | ||
| 199 | |||
| 169 | static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) | 200 | static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) |
| 170 | { | 201 | { |
| 171 | BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); | 202 | BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); |
| @@ -1461,7 +1492,7 @@ long syscall_trace_enter(struct pt_regs *regs) | |||
| 1461 | { | 1492 | { |
| 1462 | long ret = 0; | 1493 | long ret = 0; |
| 1463 | 1494 | ||
| 1464 | rcu_user_exit(); | 1495 | user_exit(); |
| 1465 | 1496 | ||
| 1466 | /* | 1497 | /* |
| 1467 | * If we stepped into a sysenter/syscall insn, it trapped in | 1498 | * If we stepped into a sysenter/syscall insn, it trapped in |
| @@ -1511,6 +1542,13 @@ void syscall_trace_leave(struct pt_regs *regs) | |||
| 1511 | { | 1542 | { |
| 1512 | bool step; | 1543 | bool step; |
| 1513 | 1544 | ||
| 1545 | /* | ||
| 1546 | * We may come here right after calling schedule_user() | ||
| 1547 | * or do_notify_resume(), in which case we can be in RCU | ||
| 1548 | * user mode. | ||
| 1549 | */ | ||
| 1550 | user_exit(); | ||
| 1551 | |||
| 1514 | audit_syscall_exit(regs); | 1552 | audit_syscall_exit(regs); |
| 1515 | 1553 | ||
| 1516 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) | 1554 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
| @@ -1527,5 +1565,5 @@ void syscall_trace_leave(struct pt_regs *regs) | |||
| 1527 | if (step || test_thread_flag(TIF_SYSCALL_TRACE)) | 1565 | if (step || test_thread_flag(TIF_SYSCALL_TRACE)) |
| 1528 | tracehook_report_syscall_exit(regs, step); | 1566 | tracehook_report_syscall_exit(regs, step); |
| 1529 | 1567 | ||
| 1530 | rcu_user_enter(); | 1568 | user_enter(); |
| 1531 | } | 1569 | } |
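kernel_stack_pointer() above leans on the kernel stack being THREAD_SIZE-aligned: masking an address with ~(THREAD_SIZE - 1) yields the base of its stack, so '&regs->sp' belongs to the current stack only if it masks down to the same base as regs itself. A tiny sketch of that alignment test, with 8 KiB standing in for THREAD_SIZE and made-up addresses.

#include <stdint.h>
#include <stdio.h>

#define STACK_SIZE 8192ULL	/* stand-in for THREAD_SIZE */

static int same_stack(uint64_t a, uint64_t b)
{
	return (a & ~(STACK_SIZE - 1)) == (b & ~(STACK_SIZE - 1));
}

int main(void)
{
	uint64_t regs     = 0xc0123f40;				/* somewhere inside a stack */
	uint64_t inside   = regs + 8;				/* '&regs->sp' analogue */
	uint64_t one_past = (regs | (STACK_SIZE - 1)) + 1;	/* just past the top */

	printf("inside   -> %s\n", same_stack(regs, inside)   ? "same stack" : "fall back");
	printf("one past -> %s\n", same_stack(regs, one_past) ? "same stack" : "fall back");
	return 0;
}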
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 42eb3300dfc6..85c39590c1a4 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c | |||
| @@ -17,23 +17,13 @@ | |||
| 17 | 17 | ||
| 18 | #include <linux/kernel.h> | 18 | #include <linux/kernel.h> |
| 19 | #include <linux/percpu.h> | 19 | #include <linux/percpu.h> |
| 20 | #include <linux/notifier.h> | ||
| 21 | #include <linux/sched.h> | ||
| 22 | #include <linux/gfp.h> | ||
| 23 | #include <linux/bootmem.h> | ||
| 24 | #include <asm/fixmap.h> | ||
| 20 | #include <asm/pvclock.h> | 25 | #include <asm/pvclock.h> |
| 21 | 26 | ||
| 22 | /* | ||
| 23 | * These are perodically updated | ||
| 24 | * xen: magic shared_info page | ||
| 25 | * kvm: gpa registered via msr | ||
| 26 | * and then copied here. | ||
| 27 | */ | ||
| 28 | struct pvclock_shadow_time { | ||
| 29 | u64 tsc_timestamp; /* TSC at last update of time vals. */ | ||
| 30 | u64 system_timestamp; /* Time, in nanosecs, since boot. */ | ||
| 31 | u32 tsc_to_nsec_mul; | ||
| 32 | int tsc_shift; | ||
| 33 | u32 version; | ||
| 34 | u8 flags; | ||
| 35 | }; | ||
| 36 | |||
| 37 | static u8 valid_flags __read_mostly = 0; | 27 | static u8 valid_flags __read_mostly = 0; |
| 38 | 28 | ||
| 39 | void pvclock_set_flags(u8 flags) | 29 | void pvclock_set_flags(u8 flags) |
| @@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags) | |||
| 41 | valid_flags = flags; | 31 | valid_flags = flags; |
| 42 | } | 32 | } |
| 43 | 33 | ||
| 44 | static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) | ||
| 45 | { | ||
| 46 | u64 delta = native_read_tsc() - shadow->tsc_timestamp; | ||
| 47 | return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, | ||
| 48 | shadow->tsc_shift); | ||
| 49 | } | ||
| 50 | |||
| 51 | /* | ||
| 52 | * Reads a consistent set of time-base values from hypervisor, | ||
| 53 | * into a shadow data area. | ||
| 54 | */ | ||
| 55 | static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, | ||
| 56 | struct pvclock_vcpu_time_info *src) | ||
| 57 | { | ||
| 58 | do { | ||
| 59 | dst->version = src->version; | ||
| 60 | rmb(); /* fetch version before data */ | ||
| 61 | dst->tsc_timestamp = src->tsc_timestamp; | ||
| 62 | dst->system_timestamp = src->system_time; | ||
| 63 | dst->tsc_to_nsec_mul = src->tsc_to_system_mul; | ||
| 64 | dst->tsc_shift = src->tsc_shift; | ||
| 65 | dst->flags = src->flags; | ||
| 66 | rmb(); /* test version after fetching data */ | ||
| 67 | } while ((src->version & 1) || (dst->version != src->version)); | ||
| 68 | |||
| 69 | return dst->version; | ||
| 70 | } | ||
| 71 | |||
| 72 | unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) | 34 | unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) |
| 73 | { | 35 | { |
| 74 | u64 pv_tsc_khz = 1000000ULL << 32; | 36 | u64 pv_tsc_khz = 1000000ULL << 32; |
| @@ -88,23 +50,32 @@ void pvclock_resume(void) | |||
| 88 | atomic64_set(&last_value, 0); | 50 | atomic64_set(&last_value, 0); |
| 89 | } | 51 | } |
| 90 | 52 | ||
| 53 | u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) | ||
| 54 | { | ||
| 55 | unsigned version; | ||
| 56 | cycle_t ret; | ||
| 57 | u8 flags; | ||
| 58 | |||
| 59 | do { | ||
| 60 | version = __pvclock_read_cycles(src, &ret, &flags); | ||
| 61 | } while ((src->version & 1) || version != src->version); | ||
| 62 | |||
| 63 | return flags & valid_flags; | ||
| 64 | } | ||
| 65 | |||
| 91 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) | 66 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) |
| 92 | { | 67 | { |
| 93 | struct pvclock_shadow_time shadow; | ||
| 94 | unsigned version; | 68 | unsigned version; |
| 95 | cycle_t ret, offset; | 69 | cycle_t ret; |
| 96 | u64 last; | 70 | u64 last; |
| 71 | u8 flags; | ||
| 97 | 72 | ||
| 98 | do { | 73 | do { |
| 99 | version = pvclock_get_time_values(&shadow, src); | 74 | version = __pvclock_read_cycles(src, &ret, &flags); |
| 100 | barrier(); | 75 | } while ((src->version & 1) || version != src->version); |
| 101 | offset = pvclock_get_nsec_offset(&shadow); | ||
| 102 | ret = shadow.system_timestamp + offset; | ||
| 103 | barrier(); | ||
| 104 | } while (version != src->version); | ||
| 105 | 76 | ||
| 106 | if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && | 77 | if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && |
| 107 | (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) | 78 | (flags & PVCLOCK_TSC_STABLE_BIT)) |
| 108 | return ret; | 79 | return ret; |
| 109 | 80 | ||
| 110 | /* | 81 | /* |
| @@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, | |||
| 156 | 127 | ||
| 157 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); | 128 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); |
| 158 | } | 129 | } |
| 130 | |||
| 131 | static struct pvclock_vsyscall_time_info *pvclock_vdso_info; | ||
| 132 | |||
| 133 | static struct pvclock_vsyscall_time_info * | ||
| 134 | pvclock_get_vsyscall_user_time_info(int cpu) | ||
| 135 | { | ||
| 136 | if (!pvclock_vdso_info) { | ||
| 137 | BUG(); | ||
| 138 | return NULL; | ||
| 139 | } | ||
| 140 | |||
| 141 | return &pvclock_vdso_info[cpu]; | ||
| 142 | } | ||
| 143 | |||
| 144 | struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) | ||
| 145 | { | ||
| 146 | return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; | ||
| 147 | } | ||
| 148 | |||
| 149 | #ifdef CONFIG_X86_64 | ||
| 150 | static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, | ||
| 151 | void *v) | ||
| 152 | { | ||
| 153 | struct task_migration_notifier *mn = v; | ||
| 154 | struct pvclock_vsyscall_time_info *pvti; | ||
| 155 | |||
| 156 | pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); | ||
| 157 | |||
| 158 | /* this is NULL when pvclock vsyscall is not initialized */ | ||
| 159 | if (unlikely(pvti == NULL)) | ||
| 160 | return NOTIFY_DONE; | ||
| 161 | |||
| 162 | pvti->migrate_count++; | ||
| 163 | |||
| 164 | return NOTIFY_DONE; | ||
| 165 | } | ||
| 166 | |||
| 167 | static struct notifier_block pvclock_migrate = { | ||
| 168 | .notifier_call = pvclock_task_migrate, | ||
| 169 | }; | ||
| 170 | |||
| 171 | /* | ||
| 172 | * Initialize the generic pvclock vsyscall state. This will allocate | ||
| 173 | * a/some page(s) for the per-vcpu pvclock information, set up a | ||
| 174 | * fixmap mapping for the page(s) | ||
| 175 | */ | ||
| 176 | |||
| 177 | int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, | ||
| 178 | int size) | ||
| 179 | { | ||
| 180 | int idx; | ||
| 181 | |||
| 182 | WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); | ||
| 183 | |||
| 184 | pvclock_vdso_info = i; | ||
| 185 | |||
| 186 | for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { | ||
| 187 | __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, | ||
| 188 | __pa_symbol(i) + (idx*PAGE_SIZE), | ||
| 189 | PAGE_KERNEL_VVAR); | ||
| 190 | } | ||
| 191 | |||
| 192 | |||
| 193 | register_task_migration_notifier(&pvclock_migrate); | ||
| 194 | |||
| 195 | return 0; | ||
| 196 | } | ||
| 197 | #endif | ||
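Both pvclock_read_flags() and the slimmed-down pvclock_clocksource_read() above share one retry loop: re-read while the version is odd (an update is in flight) or has changed since the snapshot was taken. A self-contained sketch of that seqcount-style reader; __pvclock_read_cycles is replaced by a plain snapshot helper and the barriers a real reader needs are only noted in a comment.

#include <stdint.h>
#include <stdio.h>

struct time_info {
	volatile uint32_t version;	/* odd while the writer is mid-update */
	volatile uint64_t system_ns;
};

static uint32_t snapshot(const struct time_info *src, uint64_t *ns)
{
	uint32_t ver = src->version;
	*ns = src->system_ns;		/* a real reader brackets this with barriers */
	return ver;
}

static uint64_t read_clock(const struct time_info *src)
{
	uint64_t ns;
	uint32_t version;

	do {
		version = snapshot(src, &ns);
	} while ((src->version & 1) || version != src->version);

	return ns;
}

int main(void)
{
	struct time_info ti = { .version = 2, .system_ns = 123456789 };
	printf("clock: %llu ns\n", (unsigned long long)read_clock(&ti));
	return 0;
}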
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 1b27de563561..26ee48a33dc4 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c | |||
| @@ -8,7 +8,7 @@ | |||
| 8 | 8 | ||
| 9 | #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) | 9 | #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) |
| 10 | 10 | ||
| 11 | static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) | 11 | static void quirk_intel_irqbalance(struct pci_dev *dev) |
| 12 | { | 12 | { |
| 13 | u8 config; | 13 | u8 config; |
| 14 | u16 word; | 14 | u16 word; |
| @@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, | |||
| 512 | 512 | ||
| 513 | #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) | 513 | #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) |
| 514 | /* Set correct numa_node information for AMD NB functions */ | 514 | /* Set correct numa_node information for AMD NB functions */ |
| 515 | static void __devinit quirk_amd_nb_node(struct pci_dev *dev) | 515 | static void quirk_amd_nb_node(struct pci_dev *dev) |
| 516 | { | 516 | { |
| 517 | struct pci_dev *nb_ht; | 517 | struct pci_dev *nb_ht; |
| 518 | unsigned int devfn; | 518 | unsigned int devfn; |
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 4929c1be0ac0..801602b5d745 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c | |||
| @@ -195,12 +195,6 @@ void read_persistent_clock(struct timespec *ts) | |||
| 195 | ts->tv_nsec = 0; | 195 | ts->tv_nsec = 0; |
| 196 | } | 196 | } |
| 197 | 197 | ||
| 198 | unsigned long long native_read_tsc(void) | ||
| 199 | { | ||
| 200 | return __native_read_tsc(); | ||
| 201 | } | ||
| 202 | EXPORT_SYMBOL(native_read_tsc); | ||
| 203 | |||
| 204 | 198 | ||
| 205 | static struct resource rtc_resources[] = { | 199 | static struct resource rtc_resources[] = { |
| 206 | [0] = { | 200 | [0] = { |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ca45696f30fb..00f6c1472b85 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
| @@ -143,11 +143,7 @@ int default_check_phys_apicid_present(int phys_apicid) | |||
| 143 | } | 143 | } |
| 144 | #endif | 144 | #endif |
| 145 | 145 | ||
| 146 | #ifndef CONFIG_DEBUG_BOOT_PARAMS | ||
| 147 | struct boot_params __initdata boot_params; | ||
| 148 | #else | ||
| 149 | struct boot_params boot_params; | 146 | struct boot_params boot_params; |
| 150 | #endif | ||
| 151 | 147 | ||
| 152 | /* | 148 | /* |
| 153 | * Machine setup.. | 149 | * Machine setup.. |
| @@ -614,6 +610,83 @@ static __init void reserve_ibft_region(void) | |||
| 614 | 610 | ||
| 615 | static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; | 611 | static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; |
| 616 | 612 | ||
| 613 | static bool __init snb_gfx_workaround_needed(void) | ||
| 614 | { | ||
| 615 | #ifdef CONFIG_PCI | ||
| 616 | int i; | ||
| 617 | u16 vendor, devid; | ||
| 618 | static const __initconst u16 snb_ids[] = { | ||
| 619 | 0x0102, | ||
| 620 | 0x0112, | ||
| 621 | 0x0122, | ||
| 622 | 0x0106, | ||
| 623 | 0x0116, | ||
| 624 | 0x0126, | ||
| 625 | 0x010a, | ||
| 626 | }; | ||
| 627 | |||
| 628 | /* Assume no if something weird is going on with PCI */ | ||
| 629 | if (!early_pci_allowed()) | ||
| 630 | return false; | ||
| 631 | |||
| 632 | vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID); | ||
| 633 | if (vendor != 0x8086) | ||
| 634 | return false; | ||
| 635 | |||
| 636 | devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID); | ||
| 637 | for (i = 0; i < ARRAY_SIZE(snb_ids); i++) | ||
| 638 | if (devid == snb_ids[i]) | ||
| 639 | return true; | ||
| 640 | #endif | ||
| 641 | |||
| 642 | return false; | ||
| 643 | } | ||
| 644 | |||
| 645 | /* | ||
| 646 | * Sandy Bridge graphics has trouble with certain ranges, exclude | ||
| 647 | * them from allocation. | ||
| 648 | */ | ||
| 649 | static void __init trim_snb_memory(void) | ||
| 650 | { | ||
| 651 | static const __initconst unsigned long bad_pages[] = { | ||
| 652 | 0x20050000, | ||
| 653 | 0x20110000, | ||
| 654 | 0x20130000, | ||
| 655 | 0x20138000, | ||
| 656 | 0x40004000, | ||
| 657 | }; | ||
| 658 | int i; | ||
| 659 | |||
| 660 | if (!snb_gfx_workaround_needed()) | ||
| 661 | return; | ||
| 662 | |||
| 663 | printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); | ||
| 664 | |||
| 665 | /* | ||
| 666 | * Reserve all memory below the 1 MB mark that has not | ||
| 667 | * already been reserved. | ||
| 668 | */ | ||
| 669 | memblock_reserve(0, 1<<20); | ||
| 670 | |||
| 671 | for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { | ||
| 672 | if (memblock_reserve(bad_pages[i], PAGE_SIZE)) | ||
| 673 | printk(KERN_WARNING "failed to reserve 0x%08lx\n", | ||
| 674 | bad_pages[i]); | ||
| 675 | } | ||
| 676 | } | ||
| 677 | |||
| 678 | /* | ||
| 679 | * Here we put platform-specific memory range workarounds, i.e. | ||
| 680 | * memory known to be corrupt or otherwise in need to be reserved on | ||
| 681 | * specific platforms. | ||
| 682 | * | ||
| 683 | * If this gets used more widely it could use a real dispatch mechanism. | ||
| 684 | */ | ||
| 685 | static void __init trim_platform_memory_ranges(void) | ||
| 686 | { | ||
| 687 | trim_snb_memory(); | ||
| 688 | } | ||
| 689 | |||
| 617 | static void __init trim_bios_range(void) | 690 | static void __init trim_bios_range(void) |
| 618 | { | 691 | { |
| 619 | /* | 692 | /* |
| @@ -634,6 +707,7 @@ static void __init trim_bios_range(void) | |||
| 634 | * take them out. | 707 | * take them out. |
| 635 | */ | 708 | */ |
| 636 | e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); | 709 | e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); |
| 710 | |||
| 637 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | 711 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
| 638 | } | 712 | } |
| 639 | 713 | ||
| @@ -912,6 +986,8 @@ void __init setup_arch(char **cmdline_p) | |||
| 912 | 986 | ||
| 913 | setup_real_mode(); | 987 | setup_real_mode(); |
| 914 | 988 | ||
| 989 | trim_platform_memory_ranges(); | ||
| 990 | |||
| 915 | init_gbpages(); | 991 | init_gbpages(); |
| 916 | 992 | ||
| 917 | /* max_pfn_mapped is updated here */ | 993 | /* max_pfn_mapped is updated here */ |
| @@ -956,6 +1032,10 @@ void __init setup_arch(char **cmdline_p) | |||
| 956 | 1032 | ||
| 957 | reserve_initrd(); | 1033 | reserve_initrd(); |
| 958 | 1034 | ||
| 1035 | #if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD) | ||
| 1036 | acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start); | ||
| 1037 | #endif | ||
| 1038 | |||
| 959 | reserve_crashkernel(); | 1039 | reserve_crashkernel(); |
| 960 | 1040 | ||
| 961 | vsmp_init(); | 1041 | vsmp_init(); |
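snb_gfx_workaround_needed() above is a straight match of the integrated-graphics device at 0:2.0 against a short table of affected Sandy Bridge IDs. A minimal sketch of that lookup; the device IDs are copied from the hunk, while the PCI probe itself is replaced by plain parameters.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static const uint16_t snb_ids[] = {
	0x0102, 0x0112, 0x0122, 0x0106, 0x0116, 0x0126, 0x010a,
};

static bool snb_workaround_needed(uint16_t vendor, uint16_t devid)
{
	size_t i;

	if (vendor != 0x8086)	/* only the Intel IGD qualifies */
		return false;
	for (i = 0; i < sizeof(snb_ids) / sizeof(snb_ids[0]); i++)
		if (devid == snb_ids[i])
			return true;
	return false;
}

int main(void)
{
	printf("%d\n", snb_workaround_needed(0x8086, 0x0126));	/* 1: listed */
	printf("%d\n", snb_workaround_needed(0x8086, 0x0152));	/* 0: not listed */
	return 0;
}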
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 70b27ee6118e..d6bf1f34a6e9 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/uaccess.h> | 22 | #include <linux/uaccess.h> |
| 23 | #include <linux/user-return-notifier.h> | 23 | #include <linux/user-return-notifier.h> |
| 24 | #include <linux/uprobes.h> | 24 | #include <linux/uprobes.h> |
| 25 | #include <linux/context_tracking.h> | ||
| 25 | 26 | ||
| 26 | #include <asm/processor.h> | 27 | #include <asm/processor.h> |
| 27 | #include <asm/ucontext.h> | 28 | #include <asm/ucontext.h> |
| @@ -363,10 +364,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
| 363 | else | 364 | else |
| 364 | put_user_ex(0, &frame->uc.uc_flags); | 365 | put_user_ex(0, &frame->uc.uc_flags); |
| 365 | put_user_ex(0, &frame->uc.uc_link); | 366 | put_user_ex(0, &frame->uc.uc_link); |
| 366 | put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | 367 | err |= __save_altstack(&frame->uc.uc_stack, regs->sp); |
| 367 | put_user_ex(sas_ss_flags(regs->sp), | ||
| 368 | &frame->uc.uc_stack.ss_flags); | ||
| 369 | put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); | ||
| 370 | 368 | ||
| 371 | /* Set up to return from userspace. */ | 369 | /* Set up to return from userspace. */ |
| 372 | restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); | 370 | restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); |
| @@ -413,7 +411,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
| 413 | struct rt_sigframe __user *frame; | 411 | struct rt_sigframe __user *frame; |
| 414 | void __user *fp = NULL; | 412 | void __user *fp = NULL; |
| 415 | int err = 0; | 413 | int err = 0; |
| 416 | struct task_struct *me = current; | ||
| 417 | 414 | ||
| 418 | frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp); | 415 | frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp); |
| 419 | 416 | ||
| @@ -432,10 +429,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
| 432 | else | 429 | else |
| 433 | put_user_ex(0, &frame->uc.uc_flags); | 430 | put_user_ex(0, &frame->uc.uc_flags); |
| 434 | put_user_ex(0, &frame->uc.uc_link); | 431 | put_user_ex(0, &frame->uc.uc_link); |
| 435 | put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | 432 | err |= __save_altstack(&frame->uc.uc_stack, regs->sp); |
| 436 | put_user_ex(sas_ss_flags(regs->sp), | ||
| 437 | &frame->uc.uc_stack.ss_flags); | ||
| 438 | put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size); | ||
| 439 | 433 | ||
| 440 | /* Set up to return from userspace. If provided, use a stub | 434 | /* Set up to return from userspace. If provided, use a stub |
| 441 | already in userspace. */ | 435 | already in userspace. */ |
| @@ -502,10 +496,7 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, | |||
| 502 | else | 496 | else |
| 503 | put_user_ex(0, &frame->uc.uc_flags); | 497 | put_user_ex(0, &frame->uc.uc_flags); |
| 504 | put_user_ex(0, &frame->uc.uc_link); | 498 | put_user_ex(0, &frame->uc.uc_link); |
| 505 | put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | 499 | err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); |
| 506 | put_user_ex(sas_ss_flags(regs->sp), | ||
| 507 | &frame->uc.uc_stack.ss_flags); | ||
| 508 | put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); | ||
| 509 | put_user_ex(0, &frame->uc.uc__pad0); | 500 | put_user_ex(0, &frame->uc.uc__pad0); |
| 510 | 501 | ||
| 511 | if (ka->sa.sa_flags & SA_RESTORER) { | 502 | if (ka->sa.sa_flags & SA_RESTORER) { |
| @@ -602,13 +593,6 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, | |||
| 602 | } | 593 | } |
| 603 | #endif /* CONFIG_X86_32 */ | 594 | #endif /* CONFIG_X86_32 */ |
| 604 | 595 | ||
| 605 | long | ||
| 606 | sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, | ||
| 607 | struct pt_regs *regs) | ||
| 608 | { | ||
| 609 | return do_sigaltstack(uss, uoss, regs->sp); | ||
| 610 | } | ||
| 611 | |||
| 612 | /* | 596 | /* |
| 613 | * Do a signal return; undo the signal stack. | 597 | * Do a signal return; undo the signal stack. |
| 614 | */ | 598 | */ |
| @@ -658,7 +642,7 @@ long sys_rt_sigreturn(struct pt_regs *regs) | |||
| 658 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) | 642 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) |
| 659 | goto badframe; | 643 | goto badframe; |
| 660 | 644 | ||
| 661 | if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) | 645 | if (restore_altstack(&frame->uc.uc_stack)) |
| 662 | goto badframe; | 646 | goto badframe; |
| 663 | 647 | ||
| 664 | return ax; | 648 | return ax; |
| @@ -816,7 +800,7 @@ static void do_signal(struct pt_regs *regs) | |||
| 816 | void | 800 | void |
| 817 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | 801 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) |
| 818 | { | 802 | { |
| 819 | rcu_user_exit(); | 803 | user_exit(); |
| 820 | 804 | ||
| 821 | #ifdef CONFIG_X86_MCE | 805 | #ifdef CONFIG_X86_MCE |
| 822 | /* notify userspace of pending MCEs */ | 806 | /* notify userspace of pending MCEs */ |
| @@ -838,7 +822,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | |||
| 838 | if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) | 822 | if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) |
| 839 | fire_user_return_notifiers(); | 823 | fire_user_return_notifiers(); |
| 840 | 824 | ||
| 841 | rcu_user_enter(); | 825 | user_enter(); |
| 842 | } | 826 | } |
| 843 | 827 | ||
| 844 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) | 828 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) |
| @@ -864,7 +848,6 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) | |||
| 864 | struct rt_sigframe_x32 __user *frame; | 848 | struct rt_sigframe_x32 __user *frame; |
| 865 | sigset_t set; | 849 | sigset_t set; |
| 866 | unsigned long ax; | 850 | unsigned long ax; |
| 867 | struct pt_regs tregs; | ||
| 868 | 851 | ||
| 869 | frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); | 852 | frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); |
| 870 | 853 | ||
| @@ -878,8 +861,7 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) | |||
| 878 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) | 861 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) |
| 879 | goto badframe; | 862 | goto badframe; |
| 880 | 863 | ||
| 881 | tregs = *regs; | 864 | if (compat_restore_altstack(&frame->uc.uc_stack)) |
| 882 | if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) | ||
| 883 | goto badframe; | 865 | goto badframe; |
| 884 | 866 | ||
| 885 | return ax; | 867 | return ax; |
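The signal.c hunks replace three open-coded put_user_ex() stores of sas_ss_sp/ss_flags/ss_size with the generic __save_altstack() helper, and the sigreturn paths with restore_altstack()/compat_restore_altstack(). A toy sketch of what such a save boils down to: filling a stack_t-like record from the task's altstack fields, with the flags derived from the current stack pointer. The types and flag logic here are simplified stand-ins, not the uapi definitions.

#include <stdio.h>

struct toy_stack { unsigned long ss_sp, ss_size; int ss_flags; };

#define SS_ONSTACK 1
#define SS_DISABLE 2

/* stand-ins for current->sas_ss_sp / current->sas_ss_size */
static unsigned long sas_ss_sp   = 0x70000000;
static unsigned long sas_ss_size = 0x2000;

static int toy_ss_flags(unsigned long sp)
{
	if (!sas_ss_size)
		return SS_DISABLE;
	return (sp - sas_ss_sp < sas_ss_size) ? SS_ONSTACK : 0;
}

static void save_altstack(struct toy_stack *uc_stack, unsigned long sp)
{
	uc_stack->ss_sp    = sas_ss_sp;
	uc_stack->ss_flags = toy_ss_flags(sp);
	uc_stack->ss_size  = sas_ss_size;
}

int main(void)
{
	struct toy_stack st;

	save_altstack(&st, 0x70001000);	/* sp currently on the alternate stack */
	printf("ss_sp=%#lx flags=%d size=%#lx\n", st.ss_sp, st.ss_flags, st.ss_size);
	return 0;
}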
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c80a33bc528b..ed0fe385289d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
| @@ -68,6 +68,8 @@ | |||
| 68 | #include <asm/mwait.h> | 68 | #include <asm/mwait.h> |
| 69 | #include <asm/apic.h> | 69 | #include <asm/apic.h> |
| 70 | #include <asm/io_apic.h> | 70 | #include <asm/io_apic.h> |
| 71 | #include <asm/i387.h> | ||
| 72 | #include <asm/fpu-internal.h> | ||
| 71 | #include <asm/setup.h> | 73 | #include <asm/setup.h> |
| 72 | #include <asm/uv/uv.h> | 74 | #include <asm/uv/uv.h> |
| 73 | #include <linux/mc146818rtc.h> | 75 | #include <linux/mc146818rtc.h> |
| @@ -125,8 +127,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); | |||
| 125 | atomic_t init_deasserted; | 127 | atomic_t init_deasserted; |
| 126 | 128 | ||
| 127 | /* | 129 | /* |
| 128 | * Report back to the Boot Processor. | 130 | * Report back to the Boot Processor during boot time or to the caller processor |
| 129 | * Running on AP. | 131 | * during CPU online. |
| 130 | */ | 132 | */ |
| 131 | static void __cpuinit smp_callin(void) | 133 | static void __cpuinit smp_callin(void) |
| 132 | { | 134 | { |
| @@ -138,15 +140,17 @@ static void __cpuinit smp_callin(void) | |||
| 138 | * we may get here before an INIT-deassert IPI reaches | 140 | * we may get here before an INIT-deassert IPI reaches |
| 139 | * our local APIC. We have to wait for the IPI or we'll | 141 | * our local APIC. We have to wait for the IPI or we'll |
| 140 | * lock up on an APIC access. | 142 | * lock up on an APIC access. |
| 143 | * | ||
| 144 | * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI. | ||
| 141 | */ | 145 | */ |
| 142 | if (apic->wait_for_init_deassert) | 146 | cpuid = smp_processor_id(); |
| 147 | if (apic->wait_for_init_deassert && cpuid != 0) | ||
| 143 | apic->wait_for_init_deassert(&init_deasserted); | 148 | apic->wait_for_init_deassert(&init_deasserted); |
| 144 | 149 | ||
| 145 | /* | 150 | /* |
| 146 | * (This works even if the APIC is not enabled.) | 151 | * (This works even if the APIC is not enabled.) |
| 147 | */ | 152 | */ |
| 148 | phys_id = read_apic_id(); | 153 | phys_id = read_apic_id(); |
| 149 | cpuid = smp_processor_id(); | ||
| 150 | if (cpumask_test_cpu(cpuid, cpu_callin_mask)) { | 154 | if (cpumask_test_cpu(cpuid, cpu_callin_mask)) { |
| 151 | panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, | 155 | panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, |
| 152 | phys_id, cpuid); | 156 | phys_id, cpuid); |
| @@ -228,6 +232,8 @@ static void __cpuinit smp_callin(void) | |||
| 228 | cpumask_set_cpu(cpuid, cpu_callin_mask); | 232 | cpumask_set_cpu(cpuid, cpu_callin_mask); |
| 229 | } | 233 | } |
| 230 | 234 | ||
| 235 | static int cpu0_logical_apicid; | ||
| 236 | static int enable_start_cpu0; | ||
| 231 | /* | 237 | /* |
| 232 | * Activate a secondary processor. | 238 | * Activate a secondary processor. |
| 233 | */ | 239 | */ |
| @@ -243,6 +249,8 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
| 243 | preempt_disable(); | 249 | preempt_disable(); |
| 244 | smp_callin(); | 250 | smp_callin(); |
| 245 | 251 | ||
| 252 | enable_start_cpu0 = 0; | ||
| 253 | |||
| 246 | #ifdef CONFIG_X86_32 | 254 | #ifdef CONFIG_X86_32 |
| 247 | /* switch away from the initial page table */ | 255 | /* switch away from the initial page table */ |
| 248 | load_cr3(swapper_pg_dir); | 256 | load_cr3(swapper_pg_dir); |
| @@ -279,19 +287,30 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
| 279 | cpu_idle(); | 287 | cpu_idle(); |
| 280 | } | 288 | } |
| 281 | 289 | ||
| 290 | void __init smp_store_boot_cpu_info(void) | ||
| 291 | { | ||
| 292 | int id = 0; /* CPU 0 */ | ||
| 293 | struct cpuinfo_x86 *c = &cpu_data(id); | ||
| 294 | |||
| 295 | *c = boot_cpu_data; | ||
| 296 | c->cpu_index = id; | ||
| 297 | } | ||
| 298 | |||
| 282 | /* | 299 | /* |
| 283 | * The bootstrap kernel entry code has set these up. Save them for | 300 | * The bootstrap kernel entry code has set these up. Save them for |
| 284 | * a given CPU | 301 | * a given CPU |
| 285 | */ | 302 | */ |
| 286 | |||
| 287 | void __cpuinit smp_store_cpu_info(int id) | 303 | void __cpuinit smp_store_cpu_info(int id) |
| 288 | { | 304 | { |
| 289 | struct cpuinfo_x86 *c = &cpu_data(id); | 305 | struct cpuinfo_x86 *c = &cpu_data(id); |
| 290 | 306 | ||
| 291 | *c = boot_cpu_data; | 307 | *c = boot_cpu_data; |
| 292 | c->cpu_index = id; | 308 | c->cpu_index = id; |
| 293 | if (id != 0) | 309 | /* |
| 294 | identify_secondary_cpu(c); | 310 | * During boot time, CPU0 has this setup already. Save the info when |
| 311 | * bringing up AP or offlined CPU0. | ||
| 312 | */ | ||
| 313 | identify_secondary_cpu(c); | ||
| 295 | } | 314 | } |
| 296 | 315 | ||
| 297 | static bool __cpuinit | 316 | static bool __cpuinit |
| @@ -313,7 +332,7 @@ do { \ | |||
| 313 | 332 | ||
| 314 | static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) | 333 | static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
| 315 | { | 334 | { |
| 316 | if (cpu_has(c, X86_FEATURE_TOPOEXT)) { | 335 | if (cpu_has_topoext) { |
| 317 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; | 336 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
| 318 | 337 | ||
| 319 | if (c->phys_proc_id == o->phys_proc_id && | 338 | if (c->phys_proc_id == o->phys_proc_id && |
| @@ -481,7 +500,7 @@ void __inquire_remote_apic(int apicid) | |||
| 481 | * won't ... remember to clear down the APIC, etc later. | 500 | * won't ... remember to clear down the APIC, etc later. |
| 482 | */ | 501 | */ |
| 483 | int __cpuinit | 502 | int __cpuinit |
| 484 | wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) | 503 | wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) |
| 485 | { | 504 | { |
| 486 | unsigned long send_status, accept_status = 0; | 505 | unsigned long send_status, accept_status = 0; |
| 487 | int maxlvt; | 506 | int maxlvt; |
| @@ -489,7 +508,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) | |||
| 489 | /* Target chip */ | 508 | /* Target chip */ |
| 490 | /* Boot on the stack */ | 509 | /* Boot on the stack */ |
| 491 | /* Kick the second */ | 510 | /* Kick the second */ |
| 492 | apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid); | 511 | apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); |
| 493 | 512 | ||
| 494 | pr_debug("Waiting for send to finish...\n"); | 513 | pr_debug("Waiting for send to finish...\n"); |
| 495 | send_status = safe_apic_wait_icr_idle(); | 514 | send_status = safe_apic_wait_icr_idle(); |
| @@ -649,6 +668,63 @@ static void __cpuinit announce_cpu(int cpu, int apicid) | |||
| 649 | node, cpu, apicid); | 668 | node, cpu, apicid); |
| 650 | } | 669 | } |
| 651 | 670 | ||
| 671 | static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs) | ||
| 672 | { | ||
| 673 | int cpu; | ||
| 674 | |||
| 675 | cpu = smp_processor_id(); | ||
| 676 | if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0) | ||
| 677 | return NMI_HANDLED; | ||
| 678 | |||
| 679 | return NMI_DONE; | ||
| 680 | } | ||
| 681 | |||
| 682 | /* | ||
| 683 | * Wake up AP by INIT, INIT, STARTUP sequence. | ||
| 684 | * | ||
| 685 | * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS | ||
| 686 | * boot-strap code which is not a desired behavior for waking up BSP. To | ||
| 687 | * avoid the boot-strap code, wake up CPU0 by NMI instead. | ||
| 688 | * | ||
| 689 | * This only works to wake up a soft offlined CPU0. If CPU0 is hard offlined | ||
| 690 | * (i.e. physically hot removed and then hot added), NMI won't wake it up. | ||
| 691 | * We'll change this code in the future to wake up a hard offlined CPU0 once | ||
| 692 | * a real platform and a request for it are available. | ||
| 693 | */ | ||
| 694 | static int __cpuinit | ||
| 695 | wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, | ||
| 696 | int *cpu0_nmi_registered) | ||
| 697 | { | ||
| 698 | int id; | ||
| 699 | int boot_error; | ||
| 700 | |||
| 701 | /* | ||
| 702 | * Wake up an AP by the INIT, INIT, STARTUP sequence. | ||
| 703 | */ | ||
| 704 | if (cpu) | ||
| 705 | return wakeup_secondary_cpu_via_init(apicid, start_ip); | ||
| 706 | |||
| 707 | /* | ||
| 708 | * Wake up the BSP by NMI. | ||
| 709 | * | ||
| 710 | * Register an NMI handler to help wake up CPU0. | ||
| 711 | */ | ||
| 712 | boot_error = register_nmi_handler(NMI_LOCAL, | ||
| 713 | wakeup_cpu0_nmi, 0, "wake_cpu0"); | ||
| 714 | |||
| 715 | if (!boot_error) { | ||
| 716 | enable_start_cpu0 = 1; | ||
| 717 | *cpu0_nmi_registered = 1; | ||
| 718 | if (apic->dest_logical == APIC_DEST_LOGICAL) | ||
| 719 | id = cpu0_logical_apicid; | ||
| 720 | else | ||
| 721 | id = apicid; | ||
| 722 | boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); | ||
| 723 | } | ||
| 724 | |||
| 725 | return boot_error; | ||
| 726 | } | ||
| 727 | |||
| 652 | /* | 728 | /* |
| 653 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad | 729 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad |
| 654 | * (ie clustered apic addressing mode), this is a LOGICAL apic ID. | 730 | * (ie clustered apic addressing mode), this is a LOGICAL apic ID. |
| @@ -664,6 +740,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) | |||
| 664 | 740 | ||
| 665 | unsigned long boot_error = 0; | 741 | unsigned long boot_error = 0; |
| 666 | int timeout; | 742 | int timeout; |
| 743 | int cpu0_nmi_registered = 0; | ||
| 667 | 744 | ||
| 668 | /* Just in case we booted with a single CPU. */ | 745 | /* Just in case we booted with a single CPU. */ |
| 669 | alternatives_enable_smp(); | 746 | alternatives_enable_smp(); |
| @@ -711,13 +788,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) | |||
| 711 | } | 788 | } |
| 712 | 789 | ||
| 713 | /* | 790 | /* |
| 714 | * Kick the secondary CPU. Use the method in the APIC driver | 791 | * Wake up a CPU in different cases: |
| 715 | * if it's defined - or use an INIT boot APIC message otherwise: | 792 | * - Use the method in the APIC driver if it's defined |
| 793 | * Otherwise, | ||
| 794 | * - Use an INIT boot APIC message for APs, or an NMI for the BSP. | ||
| 716 | */ | 795 | */ |
| 717 | if (apic->wakeup_secondary_cpu) | 796 | if (apic->wakeup_secondary_cpu) |
| 718 | boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); | 797 | boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); |
| 719 | else | 798 | else |
| 720 | boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); | 799 | boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, |
| 800 | &cpu0_nmi_registered); | ||
| 721 | 801 | ||
| 722 | if (!boot_error) { | 802 | if (!boot_error) { |
| 723 | /* | 803 | /* |
| @@ -782,6 +862,13 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) | |||
| 782 | */ | 862 | */ |
| 783 | smpboot_restore_warm_reset_vector(); | 863 | smpboot_restore_warm_reset_vector(); |
| 784 | } | 864 | } |
| 865 | /* | ||
| 866 | * Clean up the NMI handler. Do this after the callin and callout sync | ||
| 867 | * to avoid the impact of a possibly long unregister time. | ||
| 868 | */ | ||
| 869 | if (cpu0_nmi_registered) | ||
| 870 | unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); | ||
| 871 | |||
| 785 | return boot_error; | 872 | return boot_error; |
| 786 | } | 873 | } |
| 787 | 874 | ||
| @@ -795,7 +882,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) | |||
| 795 | 882 | ||
| 796 | pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); | 883 | pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); |
| 797 | 884 | ||
| 798 | if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || | 885 | if (apicid == BAD_APICID || |
| 799 | !physid_isset(apicid, phys_cpu_present_map) || | 886 | !physid_isset(apicid, phys_cpu_present_map) || |
| 800 | !apic->apic_id_valid(apicid)) { | 887 | !apic->apic_id_valid(apicid)) { |
| 801 | pr_err("%s: bad cpu %d\n", __func__, cpu); | 888 | pr_err("%s: bad cpu %d\n", __func__, cpu); |
| @@ -818,6 +905,9 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) | |||
| 818 | 905 | ||
| 819 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; | 906 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; |
| 820 | 907 | ||
| 908 | /* the FPU context is blank, nobody can own it */ | ||
| 909 | __cpu_disable_lazy_restore(cpu); | ||
| 910 | |||
| 821 | err = do_boot_cpu(apicid, cpu, tidle); | 911 | err = do_boot_cpu(apicid, cpu, tidle); |
| 822 | if (err) { | 912 | if (err) { |
| 823 | pr_debug("do_boot_cpu failed %d\n", err); | 913 | pr_debug("do_boot_cpu failed %d\n", err); |
| @@ -990,7 +1080,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) | |||
| 990 | /* | 1080 | /* |
| 991 | * Setup boot CPU information | 1081 | * Setup boot CPU information |
| 992 | */ | 1082 | */ |
| 993 | smp_store_cpu_info(0); /* Final full version of the data */ | 1083 | smp_store_boot_cpu_info(); /* Final full version of the data */ |
| 994 | cpumask_copy(cpu_callin_mask, cpumask_of(0)); | 1084 | cpumask_copy(cpu_callin_mask, cpumask_of(0)); |
| 995 | mb(); | 1085 | mb(); |
| 996 | 1086 | ||
| @@ -1026,6 +1116,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) | |||
| 1026 | */ | 1116 | */ |
| 1027 | setup_local_APIC(); | 1117 | setup_local_APIC(); |
| 1028 | 1118 | ||
| 1119 | if (x2apic_mode) | ||
| 1120 | cpu0_logical_apicid = apic_read(APIC_LDR); | ||
| 1121 | else | ||
| 1122 | cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); | ||
| 1123 | |||
| 1029 | /* | 1124 | /* |
| 1030 | * Enable IO APIC before setting up error vector | 1125 | * Enable IO APIC before setting up error vector |
| 1031 | */ | 1126 | */ |
| @@ -1214,19 +1309,6 @@ void cpu_disable_common(void) | |||
| 1214 | 1309 | ||
| 1215 | int native_cpu_disable(void) | 1310 | int native_cpu_disable(void) |
| 1216 | { | 1311 | { |
| 1217 | int cpu = smp_processor_id(); | ||
| 1218 | |||
| 1219 | /* | ||
| 1220 | * Perhaps use cpufreq to drop frequency, but that could go | ||
| 1221 | * into generic code. | ||
| 1222 | * | ||
| 1223 | * We won't take down the boot processor on i386 due to some | ||
| 1224 | * interrupts only being able to be serviced by the BSP. | ||
| 1225 | * Especially so if we're not using an IOAPIC -zwane | ||
| 1226 | */ | ||
| 1227 | if (cpu == 0) | ||
| 1228 | return -EBUSY; | ||
| 1229 | |||
| 1230 | clear_local_APIC(); | 1312 | clear_local_APIC(); |
| 1231 | 1313 | ||
| 1232 | cpu_disable_common(); | 1314 | cpu_disable_common(); |
| @@ -1266,6 +1348,14 @@ void play_dead_common(void) | |||
| 1266 | local_irq_disable(); | 1348 | local_irq_disable(); |
| 1267 | } | 1349 | } |
| 1268 | 1350 | ||
| 1351 | static bool wakeup_cpu0(void) | ||
| 1352 | { | ||
| 1353 | if (smp_processor_id() == 0 && enable_start_cpu0) | ||
| 1354 | return true; | ||
| 1355 | |||
| 1356 | return false; | ||
| 1357 | } | ||
| 1358 | |||
| 1269 | /* | 1359 | /* |
| 1270 | * We need to flush the caches before going to sleep, lest we have | 1360 | * We need to flush the caches before going to sleep, lest we have |
| 1271 | * dirty data in our caches when we come back up. | 1361 | * dirty data in our caches when we come back up. |
| @@ -1329,6 +1419,11 @@ static inline void mwait_play_dead(void) | |||
| 1329 | __monitor(mwait_ptr, 0, 0); | 1419 | __monitor(mwait_ptr, 0, 0); |
| 1330 | mb(); | 1420 | mb(); |
| 1331 | __mwait(eax, 0); | 1421 | __mwait(eax, 0); |
| 1422 | /* | ||
| 1423 | * If NMI wants to wake up CPU0, start CPU0. | ||
| 1424 | */ | ||
| 1425 | if (wakeup_cpu0()) | ||
| 1426 | start_cpu0(); | ||
| 1332 | } | 1427 | } |
| 1333 | } | 1428 | } |
| 1334 | 1429 | ||
| @@ -1339,6 +1434,11 @@ static inline void hlt_play_dead(void) | |||
| 1339 | 1434 | ||
| 1340 | while (1) { | 1435 | while (1) { |
| 1341 | native_halt(); | 1436 | native_halt(); |
| 1437 | /* | ||
| 1438 | * If NMI wants to wake up CPU0, start CPU0. | ||
| 1439 | */ | ||
| 1440 | if (wakeup_cpu0()) | ||
| 1441 | start_cpu0(); | ||
| 1342 | } | 1442 | } |
| 1343 | } | 1443 | } |
| 1344 | 1444 | ||
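The CPU0 wake-up path above is built on the generic x86 NMI handler API (register_nmi_handler()/unregister_nmi_handler(), NMI_LOCAL, NMI_DONE/NMI_HANDLED). As a rough, hypothetical sketch of how that API is used outside this commit, a minimal module could register and tear down a do-nothing local-NMI handler; the module and handler names below are illustrative only:

#include <linux/module.h>
#include <linux/ptrace.h>
#include <asm/nmi.h>

/* Runs on every local NMI; return NMI_HANDLED only for NMIs we own. */
static int demo_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	return NMI_DONE;	/* not ours, let other handlers look at it */
}

static int __init demo_nmi_init(void)
{
	/* same call shape as wakeup_cpu_via_init_nmi() uses for "wake_cpu0" */
	return register_nmi_handler(NMI_LOCAL, demo_nmi_handler, 0, "nmi_demo");
}

static void __exit demo_nmi_exit(void)
{
	unregister_nmi_handler(NMI_LOCAL, "nmi_demo");
}

module_init(demo_nmi_init);
module_exit(demo_nmi_exit);
MODULE_LICENSE("GPL");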
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index b4d3c3927dd8..97ef74b88e0f 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c | |||
| @@ -21,37 +21,23 @@ | |||
| 21 | 21 | ||
| 22 | /* | 22 | /* |
| 23 | * Align a virtual address to avoid aliasing in the I$ on AMD F15h. | 23 | * Align a virtual address to avoid aliasing in the I$ on AMD F15h. |
| 24 | * | ||
| 25 | * @flags denotes the allocation direction - bottomup or topdown - | ||
| 26 | * or vDSO; see call sites below. | ||
| 27 | */ | 24 | */ |
| 28 | unsigned long align_addr(unsigned long addr, struct file *filp, | 25 | static unsigned long get_align_mask(void) |
| 29 | enum align_flags flags) | ||
| 30 | { | 26 | { |
| 31 | unsigned long tmp_addr; | ||
| 32 | |||
| 33 | /* handle 32- and 64-bit case with a single conditional */ | 27 | /* handle 32- and 64-bit case with a single conditional */ |
| 34 | if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) | 28 | if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) |
| 35 | return addr; | 29 | return 0; |
| 36 | 30 | ||
| 37 | if (!(current->flags & PF_RANDOMIZE)) | 31 | if (!(current->flags & PF_RANDOMIZE)) |
| 38 | return addr; | 32 | return 0; |
| 39 | |||
| 40 | if (!((flags & ALIGN_VDSO) || filp)) | ||
| 41 | return addr; | ||
| 42 | |||
| 43 | tmp_addr = addr; | ||
| 44 | |||
| 45 | /* | ||
| 46 | * We need an address which is <= than the original | ||
| 47 | * one only when in topdown direction. | ||
| 48 | */ | ||
| 49 | if (!(flags & ALIGN_TOPDOWN)) | ||
| 50 | tmp_addr += va_align.mask; | ||
| 51 | 33 | ||
| 52 | tmp_addr &= ~va_align.mask; | 34 | return va_align.mask; |
| 35 | } | ||
| 53 | 36 | ||
| 54 | return tmp_addr; | 37 | unsigned long align_vdso_addr(unsigned long addr) |
| 38 | { | ||
| 39 | unsigned long align_mask = get_align_mask(); | ||
| 40 | return (addr + align_mask) & ~align_mask; | ||
| 55 | } | 41 | } |
| 56 | 42 | ||
| 57 | static int __init control_va_addr_alignment(char *str) | 43 | static int __init control_va_addr_alignment(char *str) |
| @@ -126,7 +112,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
| 126 | { | 112 | { |
| 127 | struct mm_struct *mm = current->mm; | 113 | struct mm_struct *mm = current->mm; |
| 128 | struct vm_area_struct *vma; | 114 | struct vm_area_struct *vma; |
| 129 | unsigned long start_addr; | 115 | struct vm_unmapped_area_info info; |
| 130 | unsigned long begin, end; | 116 | unsigned long begin, end; |
| 131 | 117 | ||
| 132 | if (flags & MAP_FIXED) | 118 | if (flags & MAP_FIXED) |
| @@ -144,50 +130,16 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
| 144 | (!vma || addr + len <= vma->vm_start)) | 130 | (!vma || addr + len <= vma->vm_start)) |
| 145 | return addr; | 131 | return addr; |
| 146 | } | 132 | } |
| 147 | if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32)) | ||
| 148 | && len <= mm->cached_hole_size) { | ||
| 149 | mm->cached_hole_size = 0; | ||
| 150 | mm->free_area_cache = begin; | ||
| 151 | } | ||
| 152 | addr = mm->free_area_cache; | ||
| 153 | if (addr < begin) | ||
| 154 | addr = begin; | ||
| 155 | start_addr = addr; | ||
| 156 | |||
| 157 | full_search: | ||
| 158 | |||
| 159 | addr = align_addr(addr, filp, 0); | ||
| 160 | |||
| 161 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
| 162 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
| 163 | if (end - len < addr) { | ||
| 164 | /* | ||
| 165 | * Start a new search - just in case we missed | ||
| 166 | * some holes. | ||
| 167 | */ | ||
| 168 | if (start_addr != begin) { | ||
| 169 | start_addr = addr = begin; | ||
| 170 | mm->cached_hole_size = 0; | ||
| 171 | goto full_search; | ||
| 172 | } | ||
| 173 | return -ENOMEM; | ||
| 174 | } | ||
| 175 | if (!vma || addr + len <= vma->vm_start) { | ||
| 176 | /* | ||
| 177 | * Remember the place where we stopped the search: | ||
| 178 | */ | ||
| 179 | mm->free_area_cache = addr + len; | ||
| 180 | return addr; | ||
| 181 | } | ||
| 182 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
| 183 | mm->cached_hole_size = vma->vm_start - addr; | ||
| 184 | 133 | ||
| 185 | addr = vma->vm_end; | 134 | info.flags = 0; |
| 186 | addr = align_addr(addr, filp, 0); | 135 | info.length = len; |
| 187 | } | 136 | info.low_limit = begin; |
| 137 | info.high_limit = end; | ||
| 138 | info.align_mask = filp ? get_align_mask() : 0; | ||
| 139 | info.align_offset = pgoff << PAGE_SHIFT; | ||
| 140 | return vm_unmapped_area(&info); | ||
| 188 | } | 141 | } |
| 189 | 142 | ||
| 190 | |||
| 191 | unsigned long | 143 | unsigned long |
| 192 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | 144 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, |
| 193 | const unsigned long len, const unsigned long pgoff, | 145 | const unsigned long len, const unsigned long pgoff, |
| @@ -195,7 +147,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
| 195 | { | 147 | { |
| 196 | struct vm_area_struct *vma; | 148 | struct vm_area_struct *vma; |
| 197 | struct mm_struct *mm = current->mm; | 149 | struct mm_struct *mm = current->mm; |
| 198 | unsigned long addr = addr0, start_addr; | 150 | unsigned long addr = addr0; |
| 151 | struct vm_unmapped_area_info info; | ||
| 199 | 152 | ||
| 200 | /* requested length too big for entire address space */ | 153 | /* requested length too big for entire address space */ |
| 201 | if (len > TASK_SIZE) | 154 | if (len > TASK_SIZE) |
| @@ -217,51 +170,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
| 217 | return addr; | 170 | return addr; |
| 218 | } | 171 | } |
| 219 | 172 | ||
| 220 | /* check if free_area_cache is useful for us */ | 173 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
| 221 | if (len <= mm->cached_hole_size) { | 174 | info.length = len; |
| 222 | mm->cached_hole_size = 0; | 175 | info.low_limit = PAGE_SIZE; |
| 223 | mm->free_area_cache = mm->mmap_base; | 176 | info.high_limit = mm->mmap_base; |
| 224 | } | 177 | info.align_mask = filp ? get_align_mask() : 0; |
| 225 | 178 | info.align_offset = pgoff << PAGE_SHIFT; | |
| 226 | try_again: | 179 | addr = vm_unmapped_area(&info); |
| 227 | /* either no address requested or can't fit in requested address hole */ | 180 | if (!(addr & ~PAGE_MASK)) |
| 228 | start_addr = addr = mm->free_area_cache; | 181 | return addr; |
| 229 | 182 | VM_BUG_ON(addr != -ENOMEM); | |
| 230 | if (addr < len) | ||
| 231 | goto fail; | ||
| 232 | |||
| 233 | addr -= len; | ||
| 234 | do { | ||
| 235 | addr = align_addr(addr, filp, ALIGN_TOPDOWN); | ||
| 236 | |||
| 237 | /* | ||
| 238 | * Lookup failure means no vma is above this address, | ||
| 239 | * else if new region fits below vma->vm_start, | ||
| 240 | * return with success: | ||
| 241 | */ | ||
| 242 | vma = find_vma(mm, addr); | ||
| 243 | if (!vma || addr+len <= vma->vm_start) | ||
| 244 | /* remember the address as a hint for next time */ | ||
| 245 | return mm->free_area_cache = addr; | ||
| 246 | |||
| 247 | /* remember the largest hole we saw so far */ | ||
| 248 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
| 249 | mm->cached_hole_size = vma->vm_start - addr; | ||
| 250 | |||
| 251 | /* try just below the current vma->vm_start */ | ||
| 252 | addr = vma->vm_start-len; | ||
| 253 | } while (len < vma->vm_start); | ||
| 254 | |||
| 255 | fail: | ||
| 256 | /* | ||
| 257 | * if hint left us with no space for the requested | ||
| 258 | * mapping then try again: | ||
| 259 | */ | ||
| 260 | if (start_addr != mm->mmap_base) { | ||
| 261 | mm->free_area_cache = mm->mmap_base; | ||
| 262 | mm->cached_hole_size = 0; | ||
| 263 | goto try_again; | ||
| 264 | } | ||
| 265 | 183 | ||
| 266 | bottomup: | 184 | bottomup: |
| 267 | /* | 185 | /* |
| @@ -270,14 +188,5 @@ bottomup: | |||
| 270 | * can happen with large stack limits and large mmap() | 188 | * can happen with large stack limits and large mmap() |
| 271 | * allocations. | 189 | * allocations. |
| 272 | */ | 190 | */ |
| 273 | mm->cached_hole_size = ~0UL; | 191 | return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); |
| 274 | mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
| 275 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | ||
| 276 | /* | ||
| 277 | * Restore the topdown base: | ||
| 278 | */ | ||
| 279 | mm->free_area_cache = mm->mmap_base; | ||
| 280 | mm->cached_hole_size = ~0UL; | ||
| 281 | |||
| 282 | return addr; | ||
| 283 | } | 192 | } |
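The rounding done by align_vdso_addr() and handed to vm_unmapped_area() through info.align_mask is ordinary mask arithmetic. A small user-space sketch; the 2 MB-style mask below is purely illustrative, not the kernel's actual va_align.mask value:

#include <stdio.h>

/* Round addr up to the next boundary described by an alignment mask. */
static unsigned long align_up(unsigned long addr, unsigned long mask)
{
	return (addr + mask) & ~mask;
}

int main(void)
{
	unsigned long mask = (1UL << 21) - 1;	/* pretend 2 MB alignment */

	/* an unaligned address is rounded up ... */
	printf("%#lx -> %#lx\n", 0x7f1234567000UL,
	       align_up(0x7f1234567000UL, mask));
	/* ... while an already aligned one is left alone */
	printf("%#lx -> %#lx\n", 0x7f1234400000UL,
	       align_up(0x7f1234400000UL, mask));
	return 0;
}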
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 76ee97709a00..6e60b5fe2244 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c | |||
| @@ -30,23 +30,110 @@ | |||
| 30 | #include <linux/mmzone.h> | 30 | #include <linux/mmzone.h> |
| 31 | #include <linux/init.h> | 31 | #include <linux/init.h> |
| 32 | #include <linux/smp.h> | 32 | #include <linux/smp.h> |
| 33 | #include <linux/irq.h> | ||
| 33 | #include <asm/cpu.h> | 34 | #include <asm/cpu.h> |
| 34 | 35 | ||
| 35 | static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); | 36 | static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); |
| 36 | 37 | ||
| 37 | #ifdef CONFIG_HOTPLUG_CPU | 38 | #ifdef CONFIG_HOTPLUG_CPU |
| 39 | |||
| 40 | #ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0 | ||
| 41 | static int cpu0_hotpluggable = 1; | ||
| 42 | #else | ||
| 43 | static int cpu0_hotpluggable; | ||
| 44 | static int __init enable_cpu0_hotplug(char *str) | ||
| 45 | { | ||
| 46 | cpu0_hotpluggable = 1; | ||
| 47 | return 1; | ||
| 48 | } | ||
| 49 | |||
| 50 | __setup("cpu0_hotplug", enable_cpu0_hotplug); | ||
| 51 | #endif | ||
| 52 | |||
| 53 | #ifdef CONFIG_DEBUG_HOTPLUG_CPU0 | ||
| 54 | /* | ||
| 55 | * This function offlines a CPU as early as possible and allows userspace to | ||
| 56 | * boot up without the CPU. The CPU can be onlined back by the user after boot. | ||
| 57 | * | ||
| 58 | * This is only called for debugging the CPU offline/online feature. | ||
| 59 | */ | ||
| 60 | int __ref _debug_hotplug_cpu(int cpu, int action) | ||
| 61 | { | ||
| 62 | struct device *dev = get_cpu_device(cpu); | ||
| 63 | int ret; | ||
| 64 | |||
| 65 | if (!cpu_is_hotpluggable(cpu)) | ||
| 66 | return -EINVAL; | ||
| 67 | |||
| 68 | cpu_hotplug_driver_lock(); | ||
| 69 | |||
| 70 | switch (action) { | ||
| 71 | case 0: | ||
| 72 | ret = cpu_down(cpu); | ||
| 73 | if (!ret) { | ||
| 74 | pr_info("CPU %u is now offline\n", cpu); | ||
| 75 | kobject_uevent(&dev->kobj, KOBJ_OFFLINE); | ||
| 76 | } else | ||
| 77 | pr_debug("Can't offline CPU%d.\n", cpu); | ||
| 78 | break; | ||
| 79 | case 1: | ||
| 80 | ret = cpu_up(cpu); | ||
| 81 | if (!ret) | ||
| 82 | kobject_uevent(&dev->kobj, KOBJ_ONLINE); | ||
| 83 | else | ||
| 84 | pr_debug("Can't online CPU%d.\n", cpu); | ||
| 85 | break; | ||
| 86 | default: | ||
| 87 | ret = -EINVAL; | ||
| 88 | } | ||
| 89 | |||
| 90 | cpu_hotplug_driver_unlock(); | ||
| 91 | |||
| 92 | return ret; | ||
| 93 | } | ||
| 94 | |||
| 95 | static int __init debug_hotplug_cpu(void) | ||
| 96 | { | ||
| 97 | _debug_hotplug_cpu(0, 0); | ||
| 98 | return 0; | ||
| 99 | } | ||
| 100 | |||
| 101 | late_initcall_sync(debug_hotplug_cpu); | ||
| 102 | #endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */ | ||
| 103 | |||
| 38 | int __ref arch_register_cpu(int num) | 104 | int __ref arch_register_cpu(int num) |
| 39 | { | 105 | { |
| 106 | struct cpuinfo_x86 *c = &cpu_data(num); | ||
| 107 | |||
| 108 | /* | ||
| 109 | * Currently CPU0 is only hotpluggable on Intel platforms. Other | ||
| 110 | * vendors can add hotplug support later. | ||
| 111 | */ | ||
| 112 | if (c->x86_vendor != X86_VENDOR_INTEL) | ||
| 113 | cpu0_hotpluggable = 0; | ||
| 114 | |||
| 40 | /* | 115 | /* |
| 41 | * CPU0 cannot be offlined due to several | 116 | * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate |
| 42 | * restrictions and assumptions in kernel. This basically | 117 | * depends on BSP. PIC interrupts depend on BSP. |
| 43 | * doesn't add a control file, one cannot attempt to offline | ||
| 44 | * BSP. | ||
| 45 | * | 118 | * |
| 46 | * Also certain PCI quirks require not to enable hotplug control | 119 | * If the BSP dependencies are under control, one can tell the kernel to |
| 47 | * for all CPU's. | 120 | * enable BSP hotplug. This basically adds a control file and |
| 121 | * one can attempt to offline the BSP. | ||
| 48 | */ | 122 | */ |
| 49 | if (num) | 123 | if (num == 0 && cpu0_hotpluggable) { |
| 124 | unsigned int irq; | ||
| 125 | /* | ||
| 126 | * We won't take down the boot processor on i386 if some | ||
| 127 | * interrupts only are able to be serviced by the BSP in PIC. | ||
| 128 | */ | ||
| 129 | for_each_active_irq(irq) { | ||
| 130 | if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) { | ||
| 131 | cpu0_hotpluggable = 0; | ||
| 132 | break; | ||
| 133 | } | ||
| 134 | } | ||
| 135 | } | ||
| 136 | if (num || cpu0_hotpluggable) | ||
| 50 | per_cpu(cpu_devices, num).cpu.hotpluggable = 1; | 137 | per_cpu(cpu_devices, num).cpu.hotpluggable = 1; |
| 51 | 138 | ||
| 52 | return register_cpu(&per_cpu(cpu_devices, num).cpu, num); | 139 | return register_cpu(&per_cpu(cpu_devices, num).cpu, num); |
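With this change, offlining the boot CPU from user space is just a write to the usual sysfs control file, which only exists once the kernel has decided CPU0 is hotpluggable (CONFIG_BOOTPARAM_HOTPLUG_CPU0 or the cpu0_hotplug command-line option, Intel only). A hedged user-space sketch, assuming root and that the file is present:

#include <stdio.h>
#include <string.h>
#include <errno.h>

/* Write 0 or 1 to /sys/devices/system/cpu/cpu0/online. */
static int set_cpu0_online(int online)
{
	FILE *f = fopen("/sys/devices/system/cpu/cpu0/online", "w");

	if (!f) {
		fprintf(stderr, "CPU0 is not hotpluggable here: %s\n",
			strerror(errno));
		return -1;
	}
	fprintf(f, "%d\n", online);
	return fclose(f);	/* the write is flushed (and may fail) here */
}

int main(void)
{
	if (set_cpu0_online(0) == 0) {
		puts("CPU0 offlined, bringing it back online");
		set_cpu0_online(1);
	}
	return 0;
}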
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c new file mode 100644 index 000000000000..25b993729f9b --- /dev/null +++ b/arch/x86/kernel/trace_clock.c | |||
| @@ -0,0 +1,21 @@ | |||
| 1 | /* | ||
| 2 | * X86 trace clocks | ||
| 3 | */ | ||
| 4 | #include <asm/trace_clock.h> | ||
| 5 | #include <asm/barrier.h> | ||
| 6 | #include <asm/msr.h> | ||
| 7 | |||
| 8 | /* | ||
| 9 | * trace_clock_x86_tsc(): A clock that is just the cycle counter. | ||
| 10 | * | ||
| 11 | * Unlike the other clocks, this is not in nanoseconds. | ||
| 12 | */ | ||
| 13 | u64 notrace trace_clock_x86_tsc(void) | ||
| 14 | { | ||
| 15 | u64 ret; | ||
| 16 | |||
| 17 | rdtsc_barrier(); | ||
| 18 | rdtscll(ret); | ||
| 19 | |||
| 20 | return ret; | ||
| 21 | } | ||
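A user-space analogue of trace_clock_x86_tsc() for x86 with GCC intrinsics: fence, then read the cycle counter. The kernel's rdtsc_barrier() picks LFENCE or MFENCE by CPU vendor; plain LFENCE is used here as a simplification.

#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>

/* Ordered TSC read: keep earlier loads from drifting past the RDTSC. */
static inline uint64_t read_tsc_ordered(void)
{
	_mm_lfence();
	return __rdtsc();
}

int main(void)
{
	uint64_t a = read_tsc_ordered();
	uint64_t b = read_tsc_ordered();

	/* the delta is in cycles, not nanoseconds, just like the trace clock */
	printf("back-to-back TSC reads differ by %llu cycles\n",
	       (unsigned long long)(b - a));
	return 0;
}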
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8276dc6794cc..ecffca11f4e9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
| @@ -55,7 +55,7 @@ | |||
| 55 | #include <asm/i387.h> | 55 | #include <asm/i387.h> |
| 56 | #include <asm/fpu-internal.h> | 56 | #include <asm/fpu-internal.h> |
| 57 | #include <asm/mce.h> | 57 | #include <asm/mce.h> |
| 58 | #include <asm/rcu.h> | 58 | #include <asm/context_tracking.h> |
| 59 | 59 | ||
| 60 | #include <asm/mach_traps.h> | 60 | #include <asm/mach_traps.h> |
| 61 | 61 | ||
| @@ -69,9 +69,6 @@ | |||
| 69 | 69 | ||
| 70 | asmlinkage int system_call(void); | 70 | asmlinkage int system_call(void); |
| 71 | 71 | ||
| 72 | /* Do we ignore FPU interrupts ? */ | ||
| 73 | char ignore_fpu_irq; | ||
| 74 | |||
| 75 | /* | 72 | /* |
| 76 | * The IDT has to be page-aligned to simplify the Pentium | 73 | * The IDT has to be page-aligned to simplify the Pentium |
| 77 | * F0 0F bug workaround. | 74 | * F0 0F bug workaround. |
| @@ -564,9 +561,6 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) | |||
| 564 | 561 | ||
| 565 | dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) | 562 | dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) |
| 566 | { | 563 | { |
| 567 | #ifdef CONFIG_X86_32 | ||
| 568 | ignore_fpu_irq = 1; | ||
| 569 | #endif | ||
| 570 | exception_enter(regs); | 564 | exception_enter(regs); |
| 571 | math_error(regs, error_code, X86_TRAP_MF); | 565 | math_error(regs, error_code, X86_TRAP_MF); |
| 572 | exception_exit(regs); | 566 | exception_exit(regs); |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cfa5d4f7ca56..06ccb5073a3f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
| @@ -77,6 +77,12 @@ unsigned long long | |||
| 77 | sched_clock(void) __attribute__((alias("native_sched_clock"))); | 77 | sched_clock(void) __attribute__((alias("native_sched_clock"))); |
| 78 | #endif | 78 | #endif |
| 79 | 79 | ||
| 80 | unsigned long long native_read_tsc(void) | ||
| 81 | { | ||
| 82 | return __native_read_tsc(); | ||
| 83 | } | ||
| 84 | EXPORT_SYMBOL(native_read_tsc); | ||
| 85 | |||
| 80 | int check_tsc_unstable(void) | 86 | int check_tsc_unstable(void) |
| 81 | { | 87 | { |
| 82 | return tsc_unstable; | 88 | return tsc_unstable; |
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index aafa5557b396..c71025b67462 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c | |||
| @@ -478,6 +478,11 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) | |||
| 478 | regs->ip = current->utask->xol_vaddr; | 478 | regs->ip = current->utask->xol_vaddr; |
| 479 | pre_xol_rip_insn(auprobe, regs, autask); | 479 | pre_xol_rip_insn(auprobe, regs, autask); |
| 480 | 480 | ||
| 481 | autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); | ||
| 482 | regs->flags |= X86_EFLAGS_TF; | ||
| 483 | if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) | ||
| 484 | set_task_blockstep(current, false); | ||
| 485 | |||
| 481 | return 0; | 486 | return 0; |
| 482 | } | 487 | } |
| 483 | 488 | ||
| @@ -603,6 +608,16 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) | |||
| 603 | if (auprobe->fixups & UPROBE_FIX_CALL) | 608 | if (auprobe->fixups & UPROBE_FIX_CALL) |
| 604 | result = adjust_ret_addr(regs->sp, correction); | 609 | result = adjust_ret_addr(regs->sp, correction); |
| 605 | 610 | ||
| 611 | /* | ||
| 612 | * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP | ||
| 613 | * so we can get an extra SIGTRAP if we do not clear TF. We need | ||
| 614 | * to examine the opcode to make it right. | ||
| 615 | */ | ||
| 616 | if (utask->autask.saved_tf) | ||
| 617 | send_sig(SIGTRAP, current, 0); | ||
| 618 | else if (!(auprobe->fixups & UPROBE_FIX_SETF)) | ||
| 619 | regs->flags &= ~X86_EFLAGS_TF; | ||
| 620 | |||
| 606 | return result; | 621 | return result; |
| 607 | } | 622 | } |
| 608 | 623 | ||
| @@ -647,6 +662,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) | |||
| 647 | current->thread.trap_nr = utask->autask.saved_trap_nr; | 662 | current->thread.trap_nr = utask->autask.saved_trap_nr; |
| 648 | handle_riprel_post_xol(auprobe, regs, NULL); | 663 | handle_riprel_post_xol(auprobe, regs, NULL); |
| 649 | instruction_pointer_set(regs, utask->vaddr); | 664 | instruction_pointer_set(regs, utask->vaddr); |
| 665 | |||
| 666 | /* clear TF if it was set by us in arch_uprobe_pre_xol() */ | ||
| 667 | if (!utask->autask.saved_tf) | ||
| 668 | regs->flags &= ~X86_EFLAGS_TF; | ||
| 650 | } | 669 | } |
| 651 | 670 | ||
| 652 | /* | 671 | /* |
| @@ -676,38 +695,3 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) | |||
| 676 | send_sig(SIGTRAP, current, 0); | 695 | send_sig(SIGTRAP, current, 0); |
| 677 | return ret; | 696 | return ret; |
| 678 | } | 697 | } |
| 679 | |||
| 680 | void arch_uprobe_enable_step(struct arch_uprobe *auprobe) | ||
| 681 | { | ||
| 682 | struct task_struct *task = current; | ||
| 683 | struct arch_uprobe_task *autask = &task->utask->autask; | ||
| 684 | struct pt_regs *regs = task_pt_regs(task); | ||
| 685 | |||
| 686 | autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); | ||
| 687 | |||
| 688 | regs->flags |= X86_EFLAGS_TF; | ||
| 689 | if (test_tsk_thread_flag(task, TIF_BLOCKSTEP)) | ||
| 690 | set_task_blockstep(task, false); | ||
| 691 | } | ||
| 692 | |||
| 693 | void arch_uprobe_disable_step(struct arch_uprobe *auprobe) | ||
| 694 | { | ||
| 695 | struct task_struct *task = current; | ||
| 696 | struct arch_uprobe_task *autask = &task->utask->autask; | ||
| 697 | bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED); | ||
| 698 | struct pt_regs *regs = task_pt_regs(task); | ||
| 699 | /* | ||
| 700 | * The state of TIF_BLOCKSTEP was not saved so we can get an extra | ||
| 701 | * SIGTRAP if we do not clear TF. We need to examine the opcode to | ||
| 702 | * make it right. | ||
| 703 | */ | ||
| 704 | if (unlikely(trapped)) { | ||
| 705 | if (!autask->saved_tf) | ||
| 706 | regs->flags &= ~X86_EFLAGS_TF; | ||
| 707 | } else { | ||
| 708 | if (autask->saved_tf) | ||
| 709 | send_sig(SIGTRAP, task, 0); | ||
| 710 | else if (!(auprobe->fixups & UPROBE_FIX_SETF)) | ||
| 711 | regs->flags &= ~X86_EFLAGS_TF; | ||
| 712 | } | ||
| 713 | } | ||
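The TF juggling above exists because a set EFLAGS.TF delivers a debug trap (seen as SIGTRAP) after each instruction. A generic user-space illustration of that mechanism via ptrace single-stepping, unrelated to the uprobes XOL path itself:

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
	int status;
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);			/* let the parent take control */
		for (volatile int i = 0; i < 5; i++)
			;
		_exit(0);
	}

	waitpid(child, &status, 0);		/* initial SIGSTOP */

	/* each PTRACE_SINGLESTEP sets TF for one instruction; the resulting
	 * trap comes back to us as a SIGTRAP stop */
	for (int step = 0; step < 10; step++) {
		if (ptrace(PTRACE_SINGLESTEP, child, NULL, NULL) == -1)
			break;
		waitpid(child, &status, 0);
		if (WIFEXITED(status))
			break;
		printf("step %d: child stopped by signal %d\n",
		       step, WSTOPSIG(status));
	}

	if (!WIFEXITED(status)) {
		ptrace(PTRACE_CONT, child, NULL, NULL);
		waitpid(child, &status, 0);
	}
	return 0;
}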
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5c9687b1bde6..1dfe69cc78a8 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
| @@ -182,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) | |||
| 182 | if (pud_none_or_clear_bad(pud)) | 182 | if (pud_none_or_clear_bad(pud)) |
| 183 | goto out; | 183 | goto out; |
| 184 | pmd = pmd_offset(pud, 0xA0000); | 184 | pmd = pmd_offset(pud, 0xA0000); |
| 185 | split_huge_page_pmd(mm, pmd); | 185 | split_huge_page_pmd_mm(mm, 0xA0000, pmd); |
| 186 | if (pmd_none_or_clear_bad(pmd)) | 186 | if (pmd_none_or_clear_bad(pmd)) |
| 187 | goto out; | 187 | goto out; |
| 188 | pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); | 188 | pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); |
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3a3e8c9e280d..9a907a67be8f 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c | |||
| @@ -145,19 +145,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) | |||
| 145 | return nr; | 145 | return nr; |
| 146 | } | 146 | } |
| 147 | 147 | ||
| 148 | #ifdef CONFIG_SECCOMP | ||
| 149 | static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) | ||
| 150 | { | ||
| 151 | if (!seccomp_mode(&tsk->seccomp)) | ||
| 152 | return 0; | ||
| 153 | task_pt_regs(tsk)->orig_ax = syscall_nr; | ||
| 154 | task_pt_regs(tsk)->ax = syscall_nr; | ||
| 155 | return __secure_computing(syscall_nr); | ||
| 156 | } | ||
| 157 | #else | ||
| 158 | #define vsyscall_seccomp(_tsk, _nr) 0 | ||
| 159 | #endif | ||
| 160 | |||
| 161 | static bool write_ok_or_segv(unsigned long ptr, size_t size) | 148 | static bool write_ok_or_segv(unsigned long ptr, size_t size) |
| 162 | { | 149 | { |
| 163 | /* | 150 | /* |
| @@ -190,10 +177,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) | |||
| 190 | { | 177 | { |
| 191 | struct task_struct *tsk; | 178 | struct task_struct *tsk; |
| 192 | unsigned long caller; | 179 | unsigned long caller; |
| 193 | int vsyscall_nr; | 180 | int vsyscall_nr, syscall_nr, tmp; |
| 194 | int prev_sig_on_uaccess_error; | 181 | int prev_sig_on_uaccess_error; |
| 195 | long ret; | 182 | long ret; |
| 196 | int skip; | ||
| 197 | 183 | ||
| 198 | /* | 184 | /* |
| 199 | * No point in checking CS -- the only way to get here is a user mode | 185 | * No point in checking CS -- the only way to get here is a user mode |
| @@ -225,56 +211,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) | |||
| 225 | } | 211 | } |
| 226 | 212 | ||
| 227 | tsk = current; | 213 | tsk = current; |
| 228 | /* | ||
| 229 | * With a real vsyscall, page faults cause SIGSEGV. We want to | ||
| 230 | * preserve that behavior to make writing exploits harder. | ||
| 231 | */ | ||
| 232 | prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; | ||
| 233 | current_thread_info()->sig_on_uaccess_error = 1; | ||
| 234 | 214 | ||
| 235 | /* | 215 | /* |
| 216 | * Check for access_ok violations and find the syscall nr. | ||
| 217 | * | ||
| 236 | * NULL is a valid user pointer (in the access_ok sense) on 32-bit and | 218 | * NULL is a valid user pointer (in the access_ok sense) on 32-bit and |
| 237 | * 64-bit, so we don't need to special-case it here. For all the | 219 | * 64-bit, so we don't need to special-case it here. For all the |
| 238 | * vsyscalls, NULL means "don't write anything" not "write it at | 220 | * vsyscalls, NULL means "don't write anything" not "write it at |
| 239 | * address 0". | 221 | * address 0". |
| 240 | */ | 222 | */ |
| 241 | ret = -EFAULT; | ||
| 242 | skip = 0; | ||
| 243 | switch (vsyscall_nr) { | 223 | switch (vsyscall_nr) { |
| 244 | case 0: | 224 | case 0: |
| 245 | skip = vsyscall_seccomp(tsk, __NR_gettimeofday); | ||
| 246 | if (skip) | ||
| 247 | break; | ||
| 248 | |||
| 249 | if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || | 225 | if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || |
| 250 | !write_ok_or_segv(regs->si, sizeof(struct timezone))) | 226 | !write_ok_or_segv(regs->si, sizeof(struct timezone))) { |
| 251 | break; | 227 | ret = -EFAULT; |
| 228 | goto check_fault; | ||
| 229 | } | ||
| 230 | |||
| 231 | syscall_nr = __NR_gettimeofday; | ||
| 232 | break; | ||
| 233 | |||
| 234 | case 1: | ||
| 235 | if (!write_ok_or_segv(regs->di, sizeof(time_t))) { | ||
| 236 | ret = -EFAULT; | ||
| 237 | goto check_fault; | ||
| 238 | } | ||
| 239 | |||
| 240 | syscall_nr = __NR_time; | ||
| 241 | break; | ||
| 242 | |||
| 243 | case 2: | ||
| 244 | if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || | ||
| 245 | !write_ok_or_segv(regs->si, sizeof(unsigned))) { | ||
| 246 | ret = -EFAULT; | ||
| 247 | goto check_fault; | ||
| 248 | } | ||
| 249 | |||
| 250 | syscall_nr = __NR_getcpu; | ||
| 251 | break; | ||
| 252 | } | ||
| 253 | |||
| 254 | /* | ||
| 255 | * Handle seccomp. regs->ip must be the original value. | ||
| 256 | * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. | ||
| 257 | * | ||
| 258 | * We could optimize the seccomp disabled case, but performance | ||
| 259 | * here doesn't matter. | ||
| 260 | */ | ||
| 261 | regs->orig_ax = syscall_nr; | ||
| 262 | regs->ax = -ENOSYS; | ||
| 263 | tmp = secure_computing(syscall_nr); | ||
| 264 | if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { | ||
| 265 | warn_bad_vsyscall(KERN_DEBUG, regs, | ||
| 266 | "seccomp tried to change syscall nr or ip"); | ||
| 267 | do_exit(SIGSYS); | ||
| 268 | } | ||
| 269 | if (tmp) | ||
| 270 | goto do_ret; /* skip requested */ | ||
| 252 | 271 | ||
| 272 | /* | ||
| 273 | * With a real vsyscall, page faults cause SIGSEGV. We want to | ||
| 274 | * preserve that behavior to make writing exploits harder. | ||
| 275 | */ | ||
| 276 | prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; | ||
| 277 | current_thread_info()->sig_on_uaccess_error = 1; | ||
| 278 | |||
| 279 | ret = -EFAULT; | ||
| 280 | switch (vsyscall_nr) { | ||
| 281 | case 0: | ||
| 253 | ret = sys_gettimeofday( | 282 | ret = sys_gettimeofday( |
| 254 | (struct timeval __user *)regs->di, | 283 | (struct timeval __user *)regs->di, |
| 255 | (struct timezone __user *)regs->si); | 284 | (struct timezone __user *)regs->si); |
| 256 | break; | 285 | break; |
| 257 | 286 | ||
| 258 | case 1: | 287 | case 1: |
| 259 | skip = vsyscall_seccomp(tsk, __NR_time); | ||
| 260 | if (skip) | ||
| 261 | break; | ||
| 262 | |||
| 263 | if (!write_ok_or_segv(regs->di, sizeof(time_t))) | ||
| 264 | break; | ||
| 265 | |||
| 266 | ret = sys_time((time_t __user *)regs->di); | 288 | ret = sys_time((time_t __user *)regs->di); |
| 267 | break; | 289 | break; |
| 268 | 290 | ||
| 269 | case 2: | 291 | case 2: |
| 270 | skip = vsyscall_seccomp(tsk, __NR_getcpu); | ||
| 271 | if (skip) | ||
| 272 | break; | ||
| 273 | |||
| 274 | if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || | ||
| 275 | !write_ok_or_segv(regs->si, sizeof(unsigned))) | ||
| 276 | break; | ||
| 277 | |||
| 278 | ret = sys_getcpu((unsigned __user *)regs->di, | 292 | ret = sys_getcpu((unsigned __user *)regs->di, |
| 279 | (unsigned __user *)regs->si, | 293 | (unsigned __user *)regs->si, |
| 280 | NULL); | 294 | NULL); |
| @@ -283,12 +297,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) | |||
| 283 | 297 | ||
| 284 | current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; | 298 | current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; |
| 285 | 299 | ||
| 286 | if (skip) { | 300 | check_fault: |
| 287 | if ((long)regs->ax <= 0L) /* seccomp errno emulation */ | ||
| 288 | goto do_ret; | ||
| 289 | goto done; /* seccomp trace/trap */ | ||
| 290 | } | ||
| 291 | |||
| 292 | if (ret == -EFAULT) { | 301 | if (ret == -EFAULT) { |
| 293 | /* Bad news -- userspace fed a bad pointer to a vsyscall. */ | 302 | /* Bad news -- userspace fed a bad pointer to a vsyscall. */ |
| 294 | warn_bad_vsyscall(KERN_INFO, regs, | 303 | warn_bad_vsyscall(KERN_INFO, regs, |
| @@ -311,7 +320,6 @@ do_ret: | |||
| 311 | /* Emulate a ret instruction. */ | 320 | /* Emulate a ret instruction. */ |
| 312 | regs->ip = caller; | 321 | regs->ip = caller; |
| 313 | regs->sp += 8; | 322 | regs->sp += 8; |
| 314 | done: | ||
| 315 | return true; | 323 | return true; |
| 316 | 324 | ||
| 317 | sigsegv: | 325 | sigsegv: |
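For context, the code being reworked above runs when user space still calls the legacy fixed-address vsyscalls. A hypothetical test like the following exercises that emulation path; it is only meaningful on an x86-64 kernel that maps the vsyscall page (emulate/native mode) and will simply fault elsewhere:

#include <stdio.h>
#include <time.h>
#include <sys/time.h>

/* Legacy fixed-address vsyscall entry points on x86-64. */
#define VSYS_GETTIMEOFDAY ((int (*)(struct timeval *, struct timezone *))0xffffffffff600000UL)
#define VSYS_TIME         ((time_t (*)(time_t *))0xffffffffff600400UL)

int main(void)
{
	struct timeval tv;

	if (VSYS_GETTIMEOFDAY(&tv, NULL) == 0)
		printf("vsyscall gettimeofday: %ld.%06ld\n",
		       (long)tv.tv_sec, (long)tv.tv_usec);

	printf("vsyscall time: %ld\n", (long)VSYS_TIME(NULL));
	return 0;
}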
