author     H. Peter Anvin <hpa@linux.intel.com>   2013-01-25 19:31:21 -0500
committer  H. Peter Anvin <hpa@linux.intel.com>   2013-01-25 19:31:21 -0500
commit     7b5c4a65cc27f017c170b025f8d6d75dabb11c6f (patch)
tree       05deacbc66a9f5c27147a6ea975211ae82281044 /arch/x86/kernel
parent     3596f5bb0a6afd01a784bfe120f420edbbf82861 (diff)
parent     949db153b6466c6f7cad5a427ecea94985927311 (diff)

Merge tag 'v3.8-rc5' into x86/mm

The __pa() fixup series that follows touches KVM code that is not present
in the existing branch based on v3.7-rc5, so merge in the current upstream
from Linus.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 2
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 12
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86/kernel/apic/apic.c | 73
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 35
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 26
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 41
-rw-r--r--  arch/x86/kernel/cpu/common.c | 14
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 4
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 75
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 209
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 39
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 11
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 127
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h | 5
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 9
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 9
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.c | 6
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 2
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 7
-rw-r--r--  arch/x86/kernel/crash.c | 32
-rw-r--r--  arch/x86/kernel/entry_32.S | 20
-rw-r--r--  arch/x86/kernel/entry_64.S | 82
-rw-r--r--  arch/x86/kernel/head_32.S | 22
-rw-r--r--  arch/x86/kernel/head_64.S | 16
-rw-r--r--  arch/x86/kernel/hpet.c | 4
-rw-r--r--  arch/x86/kernel/i387.c | 6
-rw-r--r--  arch/x86/kernel/irqinit.c | 40
-rw-r--r--  arch/x86/kernel/kvm.c | 30
-rw-r--r--  arch/x86/kernel/kvmclock.c | 88
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 8
-rw-r--r--  arch/x86/kernel/pci-dma.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 69
-rw-r--r--  arch/x86/kernel/process_32.c | 12
-rw-r--r--  arch/x86/kernel/process_64.c | 10
-rw-r--r--  arch/x86/kernel/ptrace.c | 42
-rw-r--r--  arch/x86/kernel/pvclock.c | 143
-rw-r--r--  arch/x86/kernel/quirks.c | 4
-rw-r--r--  arch/x86/kernel/rtc.c | 6
-rw-r--r--  arch/x86/kernel/setup.c | 88
-rw-r--r--  arch/x86/kernel/signal.c | 34
-rw-r--r--  arch/x86/kernel/smpboot.c | 156
-rw-r--r--  arch/x86/kernel/step.c | 9
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 151
-rw-r--r--  arch/x86/kernel/topology.c | 101
-rw-r--r--  arch/x86/kernel/trace_clock.c | 21
-rw-r--r--  arch/x86/kernel/traps.c | 8
-rw-r--r--  arch/x86/kernel/tsc.c | 6
-rw-r--r--  arch/x86/kernel/uprobes.c | 54
-rw-r--r--  arch/x86/kernel/vm86_32.c | 2
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 110

55 files changed, 1297 insertions(+), 797 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 91ce48f05f9f..34e923a53762 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -9,7 +9,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 ifdef CONFIG_FUNCTION_TRACER
 # Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_tsc.o = -pg
-CFLAGS_REMOVE_rtc.o = -pg
 CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 CFLAGS_REMOVE_pvclock.o = -pg
 CFLAGS_REMOVE_kvmclock.o = -pg
@@ -62,6 +61,7 @@ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
+obj-$(CONFIG_X86_TSC) += trace_clock.o
 obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e651f7a589ac..bacf4b0d91f4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -574,6 +574,12 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 
 	return irq;
 }
+EXPORT_SYMBOL_GPL(acpi_register_gsi);
+
+void acpi_unregister_gsi(u32 gsi)
+{
+}
+EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
 
 void __init acpi_set_irq_model_pic(void)
 {
@@ -1700,3 +1706,9 @@ int __acpi_release_global_lock(unsigned int *lock)
 	} while (unlikely (val != old));
 	return old & 0x1;
 }
+
+void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
+{
+	e820_add_region(addr, size, E820_ACPI);
+	update_e820();
+}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index f146a3c10814..0532f5d6e4ef 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,6 +101,8 @@ static int __init acpi_sleep_setup(char *str)
 #endif
 		if (strncmp(str, "nonvs", 5) == 0)
 			acpi_nvs_nosave();
+		if (strncmp(str, "nonvs_s3", 8) == 0)
+			acpi_nvs_nosave_s3();
 		if (strncmp(str, "old_ordering", 12) == 0)
 			acpi_old_suspend_ordering();
 		str = strchr(str, ',');
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b17416e72fbd..b994cc84aa7e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -90,21 +90,6 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
  */
 DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
 
-/*
- * Knob to control our willingness to enable the local APIC.
- *
- * +1=force-enable
- */
-static int force_enable_local_apic __initdata;
-/*
- * APIC command line parameters
- */
-static int __init parse_lapic(char *arg)
-{
-	force_enable_local_apic = 1;
-	return 0;
-}
-early_param("lapic", parse_lapic);
 /* Local APIC was disabled by the BIOS and enabled by the kernel */
 static int enabled_via_apicbase;
 
@@ -133,6 +118,25 @@ static inline void imcr_apic_to_pic(void)
 }
 #endif
 
+/*
+ * Knob to control our willingness to enable the local APIC.
+ *
+ * +1=force-enable
+ */
+static int force_enable_local_apic __initdata;
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+	if (config_enabled(CONFIG_X86_32) && !arg)
+		force_enable_local_apic = 1;
+	else if (!strncmp(arg, "notscdeadline", 13))
+		setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+	return 0;
+}
+early_param("lapic", parse_lapic);
+
 #ifdef CONFIG_X86_64
 static int apic_calibrate_pmtmr __initdata;
 static __init int setup_apicpmtimer(char *s)
@@ -315,6 +319,7 @@ int lapic_get_maxlvt(void)
 
 /* Clock divisor */
 #define APIC_DIVISOR 16
+#define TSC_DIVISOR  32
 
 /*
  * This function sets up the local APIC timer, with a timeout of
@@ -333,6 +338,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 	lvtt_value = LOCAL_TIMER_VECTOR;
 	if (!oneshot)
 		lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+	else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+		lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE;
+
 	if (!lapic_is_integrated())
 		lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
 
@@ -341,6 +349,11 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 
 	apic_write(APIC_LVTT, lvtt_value);
 
+	if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
+		printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
+		return;
+	}
+
 	/*
 	 * Divide PICLK by 16
 	 */
@@ -453,6 +466,16 @@ static int lapic_next_event(unsigned long delta,
 	return 0;
 }
 
+static int lapic_next_deadline(unsigned long delta,
+			       struct clock_event_device *evt)
+{
+	u64 tsc;
+
+	rdtscll(tsc);
+	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
+	return 0;
+}
+
 /*
  * Setup the lapic timer in periodic or oneshot mode
  */
@@ -533,7 +556,15 @@ static void __cpuinit setup_APIC_timer(void)
 	memcpy(levt, &lapic_clockevent, sizeof(*levt));
 	levt->cpumask = cpumask_of(smp_processor_id());
 
-	clockevents_register_device(levt);
+	if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+		levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
+				    CLOCK_EVT_FEAT_DUMMY);
+		levt->set_next_event = lapic_next_deadline;
+		clockevents_config_and_register(levt,
+						(tsc_khz / TSC_DIVISOR) * 1000,
+						0xF, ~0UL);
+	} else
+		clockevents_register_device(levt);
 }
 
 /*
@@ -661,7 +692,9 @@ static int __init calibrate_APIC_clock(void)
 	 * in the clockevent structure and return.
 	 */
 
-	if (lapic_timer_frequency) {
+	if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+		return 0;
+	} else if (lapic_timer_frequency) {
 		apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n",
 			lapic_timer_frequency);
 		lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR,
@@ -674,6 +707,9 @@ static int __init calibrate_APIC_clock(void)
 		return 0;
 	}
 
+	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+		    "calibrating APIC timer ...\n");
+
 	local_irq_disable();
 
 	/* Replace the global interrupt handler */
@@ -811,9 +847,6 @@ void __init setup_boot_APIC_clock(void)
 		return;
 	}
 
-	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
-		    "calibrating APIC timer ...\n");
-
 	if (calibrate_APIC_clock()) {
 		/* No broadcast on UP ! */
 		if (num_possible_cpus() > 1)
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index ae9196f31261..9a9110918ca7 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -22,6 +22,7 @@
 #include <linux/hardirq.h>
 #include <linux/delay.h>
 
+#include <asm/numachip/numachip.h>
 #include <asm/numachip/numachip_csr.h>
 #include <asm/smp.h>
 #include <asm/apic.h>
@@ -180,6 +181,7 @@ static int __init numachip_system_init(void)
 		return 0;
 
 	x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+	x86_init.pci.arch_init = pci_numachip_init;
 
 	map_csrs();
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1817fa911024..b739d398bb29 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -234,11 +234,11 @@ int __init arch_early_irq_init(void)
 		zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
 		/*
 		 * For legacy IRQ's, start with assigning irq0 to irq15 to
-		 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
+		 * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
 		 */
 		if (i < legacy_pic->nr_legacy_irqs) {
 			cfg[i].vector = IRQ0_VECTOR + i;
-			cpumask_set_cpu(0, cfg[i].domain);
+			cpumask_setall(cfg[i].domain);
 		}
 	}
 
@@ -1141,7 +1141,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 		 * allocation for the members that are not used anymore.
 		 */
 		cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
-		cfg->move_in_progress = 1;
+		cfg->move_in_progress =
+			cpumask_intersects(cfg->old_domain, cpu_online_mask);
 		cpumask_and(cfg->domain, cfg->domain, tmp_mask);
 		break;
 	}
@@ -1172,8 +1173,9 @@ next:
 		current_vector = vector;
 		current_offset = offset;
 		if (cfg->vector) {
-			cfg->move_in_progress = 1;
 			cpumask_copy(cfg->old_domain, cfg->domain);
+			cfg->move_in_progress =
+				cpumask_intersects(cfg->old_domain, cpu_online_mask);
 		}
 		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
 			per_cpu(vector_irq, new_cpu)[vector] = irq;
@@ -1241,12 +1243,6 @@ void __setup_vector_irq(int cpu)
 		cfg = irq_get_chip_data(irq);
 		if (!cfg)
 			continue;
-		/*
-		 * If it is a legacy IRQ handled by the legacy PIC, this cpu
-		 * will be part of the irq_cfg's domain.
-		 */
-		if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
-			cpumask_set_cpu(cpu, cfg->domain);
 
 		if (!cpumask_test_cpu(cpu, cfg->domain))
 			continue;
@@ -1356,16 +1352,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
 	if (!IO_APIC_IRQ(irq))
 		return;
 
-	/*
-	 * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC
-	 * can handle this irq and the apic driver is finialized at this point,
-	 * update the cfg->domain.
-	 */
-	if (irq < legacy_pic->nr_legacy_irqs &&
-	    cpumask_equal(cfg->domain, cpumask_of(0)))
-		apic->vector_allocation_domain(0, cfg->domain,
-					       apic->target_cpus());
-
 	if (assign_irq_vector(irq, cfg, apic->target_cpus()))
 		return;
 
@@ -2199,9 +2185,11 @@ static int ioapic_retrigger_irq(struct irq_data *data)
 {
 	struct irq_cfg *cfg = data->chip_data;
 	unsigned long flags;
+	int cpu;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
+	cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
+	apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 
 	return 1;
@@ -3317,8 +3305,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 	int ret;
 
 	if (irq_remapping_enabled) {
-		if (!setup_hpet_msi_remapped(irq, id))
-			return -1;
+		ret = setup_hpet_msi_remapped(irq, id);
+		if (ret)
+			return ret;
 	}
 
 	ret = msi_compose_msg(NULL, irq, &msg, id);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f7e98a2c0d12..15239fffd6fe 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -304,7 +304,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
 	int cpu = smp_processor_id();
 
 	/* get information required for multi-node processors */
-	if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+	if (cpu_has_topoext) {
 		u32 eax, ebx, ecx, edx;
 
 		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
@@ -631,6 +631,20 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		}
 	}
 
+	/*
+	 * The way access filter has a performance penalty on some workloads.
+	 * Disable it on the affected CPUs.
+	 */
+	if ((c->x86 == 0x15) &&
+	    (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
+		u64 val;
+
+		if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) {
+			val |= 0x1E;
+			wrmsrl_safe(0xc0011021, val);
+		}
+	}
+
 	cpu_detect_cache_sizes(c);
 
 	/* Multi core CPU? */
@@ -643,12 +657,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	detect_ht(c);
 #endif
 
-	if (c->extended_cpuid_level >= 0x80000006) {
-		if (cpuid_edx(0x80000006) & 0xf000)
-			num_cache_leaves = 4;
-		else
-			num_cache_leaves = 3;
-	}
+	init_amd_cacheinfo(c);
 
 	if (c->x86 >= 0xf)
 		set_cpu_cap(c, X86_FEATURE_K8);
@@ -739,9 +748,6 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
 
 static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
 {
-	if (!cpu_has_invlpg)
-		return;
-
 	tlb_flushall_shift = 5;
 
 	if (c->x86 <= 0x11)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d0e910da16c5..92dfec986a48 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -107,53 +107,17 @@ static void __init check_hlt(void)
 }
 
 /*
- * Most 386 processors have a bug where a POPAD can lock the
- * machine even from user space.
- */
-
-static void __init check_popad(void)
-{
-#ifndef CONFIG_X86_POPAD_OK
-	int res, inp = (int) &res;
-
-	pr_info("Checking for popad bug... ");
-	__asm__ __volatile__(
-		"movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
-		: "=&a" (res)
-		: "d" (inp)
-		: "ecx", "edi");
-	/*
-	 * If this fails, it means that any user program may lock the
-	 * CPU hard. Too bad.
-	 */
-	if (res != 12345678)
-		pr_cont("Buggy\n");
-	else
-		pr_cont("OK\n");
-#endif
-}
-
-/*
  * Check whether we are able to run this kernel safely on SMP.
  *
- * - In order to run on a i386, we need to be compiled for i386
- *   (for due to lack of "invlpg" and working WP on a i386)
+ * - i386 is no longer supported.
 * - In order to run on anything without a TSC, we need to be
 *   compiled for a i486.
 */
 
 static void __init check_config(void)
 {
-/*
- * We'd better not be a i386 if we're configured to use some
- * i486+ only features! (WP works in supervisor mode and the
- * new "invlpg" and "bswap" instructions)
- */
-#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \
-	defined(CONFIG_X86_BSWAP)
-	if (boot_cpu_data.x86 == 3)
+	if (boot_cpu_data.x86 < 4)
 		panic("Kernel requires i486+ for 'invlpg' and other features");
-#endif
 }
 
 
@@ -166,7 +130,6 @@ void __init check_bugs(void)
 #endif
 	check_config();
 	check_hlt();
-	check_popad();
 	init_utsname()->machine[1] =
 		'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
 	alternative_instructions();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7505f7b13e71..9c3ab43a6954 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1173,15 +1173,6 @@ DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
 
-/* Make sure %fs and %gs are initialized properly in idle threads */
-struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
-{
-	memset(regs, 0, sizeof(struct pt_regs));
-	regs->fs = __KERNEL_PERCPU;
-	regs->gs = __KERNEL_STACK_CANARY;
-
-	return regs;
-}
 #endif	/* CONFIG_X86_64 */
 
 /*
@@ -1237,7 +1228,7 @@ void __cpuinit cpu_init(void)
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
-	if (cpu != 0 && this_cpu_read(numa_node) == 0 &&
+	if (this_cpu_read(numa_node) == 0 &&
 	    early_cpu_to_node(cpu) != NUMA_NO_NODE)
 		set_numa_node(early_cpu_to_node(cpu));
 #endif
@@ -1269,8 +1260,7 @@ void __cpuinit cpu_init(void)
 	barrier();
 
 	x86_configure_nx();
-	if (cpu != 0)
-		enable_x2apic();
+	enable_x2apic();
 
 	/*
 	 * set up and load the per-CPU TSS
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 2249e7e44521..fdfefa27b948 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -612,10 +612,6 @@ static void __cpuinit intel_tlb_lookup(const unsigned char desc)
 
 static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
 {
-	if (!cpu_has_invlpg) {
-		tlb_flushall_shift = -1;
-		return;
-	}
 	switch ((c->x86 << 8) + c->x86_model) {
 	case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
 	case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 93c5451bdd52..fe9edec6698a 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -538,7 +538,11 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 	unsigned		edx;
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
-		amd_cpuid4(index, &eax, &ebx, &ecx);
+		if (cpu_has_topoext)
+			cpuid_count(0x8000001d, index, &eax.full,
+				    &ebx.full, &ecx.full, &edx);
+		else
+			amd_cpuid4(index, &eax, &ebx, &ecx);
 		amd_init_l3_cache(this_leaf, index);
 	} else {
 		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
@@ -557,21 +561,39 @@
 	return 0;
 }
 
-static int __cpuinit find_num_cache_leaves(void)
+static int __cpuinit find_num_cache_leaves(struct cpuinfo_x86 *c)
 {
-	unsigned int		eax, ebx, ecx, edx;
+	unsigned int		eax, ebx, ecx, edx, op;
 	union _cpuid4_leaf_eax	cache_eax;
 	int			i = -1;
 
+	if (c->x86_vendor == X86_VENDOR_AMD)
+		op = 0x8000001d;
+	else
+		op = 4;
+
 	do {
 		++i;
-		/* Do cpuid(4) loop to find out num_cache_leaves */
-		cpuid_count(4, i, &eax, &ebx, &ecx, &edx);
+		/* Do cpuid(op) loop to find out num_cache_leaves */
+		cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
 		cache_eax.full = eax;
 	} while (cache_eax.split.type != CACHE_TYPE_NULL);
 	return i;
 }
 
+void __cpuinit init_amd_cacheinfo(struct cpuinfo_x86 *c)
+{
+
+	if (cpu_has_topoext) {
+		num_cache_leaves = find_num_cache_leaves(c);
+	} else if (c->extended_cpuid_level >= 0x80000006) {
+		if (cpuid_edx(0x80000006) & 0xf000)
+			num_cache_leaves = 4;
+		else
+			num_cache_leaves = 3;
+	}
+}
+
 unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 {
 	/* Cache sizes */
@@ -588,7 +610,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 
 	if (is_initialized == 0) {
 		/* Init num_cache_leaves from boot CPU */
-		num_cache_leaves = find_num_cache_leaves();
+		num_cache_leaves = find_num_cache_leaves(c);
 		is_initialized++;
 	}
 
@@ -728,37 +750,50 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
 static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
 {
 	struct _cpuid4_info *this_leaf;
-	int ret, i, sibling;
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	int i, sibling;
 
-	ret = 0;
-	if (index == 3) {
-		ret = 1;
-		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
+	if (cpu_has_topoext) {
+		unsigned int apicid, nshared, first, last;
+
+		if (!per_cpu(ici_cpuid4_info, cpu))
+			return 0;
+
+		this_leaf = CPUID4_INFO_IDX(cpu, index);
+		nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
+		apicid = cpu_data(cpu).apicid;
+		first = apicid - (apicid % nshared);
+		last = first + nshared - 1;
+
+		for_each_online_cpu(i) {
+			apicid = cpu_data(i).apicid;
+			if ((apicid < first) || (apicid > last))
+				continue;
 			if (!per_cpu(ici_cpuid4_info, i))
 				continue;
 			this_leaf = CPUID4_INFO_IDX(i, index);
-			for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
-				if (!cpu_online(sibling))
+
+			for_each_online_cpu(sibling) {
+				apicid = cpu_data(sibling).apicid;
+				if ((apicid < first) || (apicid > last))
 					continue;
 				set_bit(sibling, this_leaf->shared_cpu_map);
 			}
 		}
-	} else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) {
-		ret = 1;
-		for_each_cpu(i, cpu_sibling_mask(cpu)) {
+	} else if (index == 3) {
+		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
			if (!per_cpu(ici_cpuid4_info, i))
 				continue;
 			this_leaf = CPUID4_INFO_IDX(i, index);
-			for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+			for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
 				if (!cpu_online(sibling))
 					continue;
 				set_bit(sibling, this_leaf->shared_cpu_map);
 			}
 		}
-	}
+	} else
+		return 0;
 
-	return ret;
+	return 1;
 }
 
 static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 6a05c1d327a9..5b7d4fa5d3b7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -24,8 +24,6 @@ struct mce_bank {
 int mce_severity(struct mce *a, int tolerant, char **msg);
 struct dentry *mce_get_debugfs_dir(void);
 
-extern int mce_ser;
-
 extern struct mce_bank *mce_banks;
 
 #ifdef CONFIG_X86_MCE_INTEL
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 13017626f9a8..beb1f1689e52 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -193,9 +193,9 @@ int mce_severity(struct mce *m, int tolerant, char **msg)
 			continue;
 		if ((m->mcgstatus & s->mcgmask) != s->mcgres)
 			continue;
-		if (s->ser == SER_REQUIRED && !mce_ser)
+		if (s->ser == SER_REQUIRED && !mca_cfg.ser)
 			continue;
-		if (s->ser == NO_SER && mce_ser)
+		if (s->ser == NO_SER && mca_cfg.ser)
 			continue;
 		if (s->context && ctx != s->context)
 			continue;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 46cbf8689692..80dbda84f1c3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -58,34 +58,26 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
 
-int mce_disabled __read_mostly;
-
 #define SPINUNIT 100	/* 100ns */
 
 atomic_t mce_entry;
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
-/*
- * Tolerant levels:
- * 0: always panic on uncorrected errors, log corrected errors
- * 1: panic or SIGBUS on uncorrected errors, log corrected errors
- * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
- * 3: never panic or SIGBUS, log all errors (for testing only)
- */
-static int tolerant __read_mostly = 1;
-static int banks __read_mostly;
-static int rip_msr __read_mostly;
-static int mce_bootlog __read_mostly = -1;
-static int monarch_timeout __read_mostly = -1;
-static int mce_panic_timeout __read_mostly;
-static int mce_dont_log_ce __read_mostly;
-int mce_cmci_disabled __read_mostly;
-int mce_ignore_ce __read_mostly;
-int mce_ser __read_mostly;
-int mce_bios_cmci_threshold __read_mostly;
-
-struct mce_bank *mce_banks __read_mostly;
+struct mce_bank *mce_banks __read_mostly;
+
+struct mca_config mca_cfg __read_mostly = {
+	.bootlog  = -1,
+	/*
+	 * Tolerant levels:
+	 * 0: always panic on uncorrected errors, log corrected errors
+	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
+	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
+	 * 3: never panic or SIGBUS, log all errors (for testing only)
+	 */
+	.tolerant = 1,
+	.monarch_timeout = -1
+};
 
 /* User mode helper program triggered by machine check event */
 static unsigned long mce_need_notify;
@@ -302,7 +294,7 @@ static void wait_for_panic(void)
 	while (timeout-- > 0)
 		udelay(1);
 	if (panic_timeout == 0)
-		panic_timeout = mce_panic_timeout;
+		panic_timeout = mca_cfg.panic_timeout;
 	panic("Panicing machine check CPU died");
 }
 
@@ -360,7 +352,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 	pr_emerg(HW_ERR "Machine check: %s\n", exp);
 	if (!fake_panic) {
 		if (panic_timeout == 0)
-			panic_timeout = mce_panic_timeout;
+			panic_timeout = mca_cfg.panic_timeout;
 		panic(msg);
 	} else
 		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -372,7 +364,7 @@ static int msr_to_offset(u32 msr)
 {
 	unsigned bank = __this_cpu_read(injectm.bank);
 
-	if (msr == rip_msr)
+	if (msr == mca_cfg.rip_msr)
 		return offsetof(struct mce, ip);
 	if (msr == MSR_IA32_MCx_STATUS(bank))
 		return offsetof(struct mce, status);
@@ -451,8 +443,8 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 		m->cs |= 3;
 	}
 	/* Use accurate RIP reporting if available. */
-	if (rip_msr)
-		m->ip = mce_rdmsrl(rip_msr);
+	if (mca_cfg.rip_msr)
+		m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 	}
 }
 
@@ -513,7 +505,7 @@ static int mce_ring_add(unsigned long pfn)
 
 int mce_available(struct cpuinfo_x86 *c)
 {
-	if (mce_disabled)
+	if (mca_cfg.disabled)
 		return 0;
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
@@ -565,7 +557,7 @@ static void mce_read_aux(struct mce *m, int i)
 	/*
 	 * Mask the reported address by the reported granularity.
 	 */
-	if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
+	if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 		u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 		m->addr >>= shift;
 		m->addr <<= shift;
@@ -599,7 +591,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 
 	mce_gather_info(&m, NULL);
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		if (!mce_banks[i].ctl || !test_bit(i, *b))
 			continue;
 
@@ -620,7 +612,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 * TBD do the same check for MCI_STATUS_EN here?
 		 */
 		if (!(flags & MCP_UC) &&
-		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
+		    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 			continue;
 
 		mce_read_aux(&m, i);
@@ -631,7 +623,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 * Don't get the IP here because it's unlikely to
 		 * have anything to do with the actual error location.
 		 */
-		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
+		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
 			mce_log(&m);
 
 		/*
@@ -658,14 +650,14 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 {
 	int i, ret = 0;
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 		if (m->status & MCI_STATUS_VAL) {
 			__set_bit(i, validp);
 			if (quirk_no_way_out)
 				quirk_no_way_out(i, m, regs);
 		}
-		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
+		if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
 			ret = 1;
 	}
 	return ret;
@@ -696,11 +688,11 @@ static int mce_timed_out(u64 *t)
 	rmb();
 	if (atomic_read(&mce_paniced))
 		wait_for_panic();
-	if (!monarch_timeout)
+	if (!mca_cfg.monarch_timeout)
 		goto out;
 	if ((s64)*t < SPINUNIT) {
 		/* CHECKME: Make panic default for 1 too? */
-		if (tolerant < 1)
+		if (mca_cfg.tolerant < 1)
 			mce_panic("Timeout synchronizing machine check over CPUs",
 				  NULL, NULL);
 		cpu_missing = 1;
@@ -750,7 +742,8 @@ static void mce_reign(void)
 	 * Grade the severity of the errors of all the CPUs.
 	 */
 	for_each_possible_cpu(cpu) {
-		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+		int severity = mce_severity(&per_cpu(mces_seen, cpu),
+					    mca_cfg.tolerant,
 					    &nmsg);
 		if (severity > global_worst) {
 			msg = nmsg;
@@ -764,7 +757,7 @@ static void mce_reign(void)
 	 * This dumps all the mces in the log buffer and stops the
 	 * other CPUs.
 	 */
-	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 		mce_panic("Fatal Machine check", m, msg);
 
 	/*
@@ -777,7 +770,7 @@ static void mce_reign(void)
 	 * No machine check event found. Must be some external
 	 * source or one CPU is hung. Panic.
 	 */
-	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
+	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 		mce_panic("Machine check from unknown source", NULL, NULL);
 
 	/*
@@ -801,7 +794,7 @@ static int mce_start(int *no_way_out)
 {
 	int order;
 	int cpus = num_online_cpus();
-	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 
 	if (!timeout)
 		return -1;
@@ -865,7 +858,7 @@ static int mce_end(int order)
 static int mce_end(int order)
 {
 	int ret = -1;
-	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 
 	if (!timeout)
 		goto reset;
@@ -946,7 +939,7 @@ static void mce_clear_state(unsigned long *toclear)
 {
 	int i;
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		if (test_bit(i, toclear))
 			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 	}
@@ -1011,6 +1004,7 @@ static void mce_clear_info(struct mce_info *mi)
 	 */
 void do_machine_check(struct pt_regs *regs, long error_code)
 {
+	struct mca_config *cfg = &mca_cfg;
 	struct mce m, *final;
 	int i;
 	int worst = 0;
@@ -1022,7 +1016,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	int order;
 	/*
 	 * If no_way_out gets set, there is no safe way to recover from this
-	 * MCE. If tolerant is cranked up, we'll try anyway.
+	 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
 	 */
 	int no_way_out = 0;
 	/*
@@ -1038,7 +1032,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	this_cpu_inc(mce_exception_count);
 
-	if (!banks)
+	if (!cfg->banks)
 		goto out;
 
 	mce_gather_info(&m, regs);
@@ -1065,7 +1059,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 * because the first one to see it will clear it.
 	 */
 	order = mce_start(&no_way_out);
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < cfg->banks; i++) {
 		__clear_bit(i, toclear);
 		if (!test_bit(i, valid_banks))
 			continue;
@@ -1084,7 +1078,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		 * Non uncorrected or non signaled errors are handled by
 		 * machine_check_poll. Leave them alone, unless this panics.
 		 */
-		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
 			!no_way_out)
 			continue;
 
@@ -1093,7 +1087,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		 */
 		add_taint(TAINT_MACHINE_CHECK);
 
-		severity = mce_severity(&m, tolerant, NULL);
+		severity = mce_severity(&m, cfg->tolerant, NULL);
 
 		/*
 		 * When machine check was for corrected handler don't touch,
@@ -1117,7 +1111,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		 * When the ring overflows we just ignore the AO error.
 		 * RED-PEN add some logging mechanism when
 		 * usable_address or mce_add_ring fails.
-		 * RED-PEN don't ignore overflow for tolerant == 0
+		 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
 		 */
 		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
 			mce_ring_add(m.addr >> PAGE_SHIFT);
@@ -1149,7 +1143,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 * issues we try to recover, or limit damage to the current
 	 * process.
 	 */
-	if (tolerant < 3) {
+	if (cfg->tolerant < 3) {
 		if (no_way_out)
 			mce_panic("Fatal machine check on current CPU", &m, msg);
 		if (worst == MCE_AR_SEVERITY) {
@@ -1377,11 +1371,13 @@ EXPORT_SYMBOL_GPL(mce_notify_irq);
 static int __cpuinit __mcheck_cpu_mce_banks_init(void)
 {
 	int i;
+	u8 num_banks = mca_cfg.banks;
 
-	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
+	mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
 	if (!mce_banks)
 		return -ENOMEM;
-	for (i = 0; i < banks; i++) {
+
+	for (i = 0; i < num_banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		b->ctl = -1ULL;
@@ -1401,7 +1397,7 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 
 	b = cap & MCG_BANKCNT_MASK;
-	if (!banks)
+	if (!mca_cfg.banks)
 		pr_info("CPU supports %d MCE banks\n", b);
 
 	if (b > MAX_NR_BANKS) {
@@ -1411,8 +1407,9 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
 	}
 
 	/* Don't support asymmetric configurations today */
-	WARN_ON(banks != 0 && b != banks);
-	banks = b;
+	WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
+	mca_cfg.banks = b;
+
 	if (!mce_banks) {
 		int err = __mcheck_cpu_mce_banks_init();
 
@@ -1422,25 +1419,29 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
 
 	/* Use accurate RIP reporting if available. */
 	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
-		rip_msr = MSR_IA32_MCG_EIP;
+		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
 
 	if (cap & MCG_SER_P)
-		mce_ser = 1;
+		mca_cfg.ser = true;
 
 	return 0;
 }
 
 static void __mcheck_cpu_init_generic(void)
 {
+	enum mcp_flags m_fl = 0;
 	mce_banks_t all_banks;
 	u64 cap;
 	int i;
 
+	if (!mca_cfg.bootlog)
+		m_fl = MCP_DONTLOG;
+
 	/*
 	 * Log the machine checks left over from the previous reset.
 	 */
 	bitmap_fill(all_banks, MAX_NR_BANKS);
-	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
+	machine_check_poll(MCP_UC | m_fl, &all_banks);
 
 	set_in_cr4(X86_CR4_MCE);
 
@@ -1448,7 +1449,7 @@ static void __mcheck_cpu_init_generic(void)
 	if (cap & MCG_CTL_P)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		if (!b->init)
@@ -1489,6 +1490,8 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
 /* Add per CPU specific workarounds here */
 static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 {
+	struct mca_config *cfg = &mca_cfg;
+
 	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
 		pr_info("unknown CPU type - not enabling MCE support\n");
 		return -EOPNOTSUPP;
@@ -1496,7 +1499,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if (c->x86 == 15 && banks > 4) {
+		if (c->x86 == 15 && cfg->banks > 4) {
 			/*
 			 * disable GART TBL walk error reporting, which
 			 * trips off incorrectly with the IOMMU & 3ware
@@ -1504,18 +1507,18 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 			 */
 			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
 		}
-		if (c->x86 <= 17 && mce_bootlog < 0) {
+		if (c->x86 <= 17 && cfg->bootlog < 0) {
 			/*
 			 * Lots of broken BIOS around that don't clear them
 			 * by default and leave crap in there. Don't log:
 			 */
-			mce_bootlog = 0;
+			cfg->bootlog = 0;
 		}
 		/*
 		 * Various K7s with broken bank 0 around. Always disable
 		 * by default.
 		 */
-		if (c->x86 == 6 && banks > 0)
+		if (c->x86 == 6 && cfg->banks > 0)
 			mce_banks[0].ctl = 0;
 
 		/*
@@ -1566,7 +1569,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 	 * valid event later, merely don't write CTL0.
 	 */
 
-	if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
+	if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
 		mce_banks[0].init = 0;
 
 	/*
@@ -1574,23 +1577,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 	 * synchronization with a one second timeout.
 	 */
 	if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
-			monarch_timeout < 0)
-		monarch_timeout = USEC_PER_SEC;
+			cfg->monarch_timeout < 0)
+		cfg->monarch_timeout = USEC_PER_SEC;
 
 	/*
 	 * There are also broken BIOSes on some Pentium M and
 	 * earlier systems:
 	 */
-	if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
-		mce_bootlog = 0;
+	if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
+		cfg->bootlog = 0;
 
 	if (c->x86 == 6 && c->x86_model == 45)
 		quirk_no_way_out = quirk_sandybridge_ifu;
 	}
-	if (monarch_timeout < 0)
-		monarch_timeout = 0;
-	if (mce_bootlog != 0)
-		mce_panic_timeout = 30;
+	if (cfg->monarch_timeout < 0)
+		cfg->monarch_timeout = 0;
+	if (cfg->bootlog != 0)
+		cfg->panic_timeout = 30;
 
 	return 0;
 }
@@ -1635,7 +1638,7 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 
 	__this_cpu_write(mce_next_interval, iv);
 
-	if (mce_ignore_ce || !iv)
+	if (mca_cfg.ignore_ce || !iv)
 		return;
 
 	t->expires = round_jiffies(jiffies + iv);
@@ -1668,7 +1671,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
  */
 void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
 {
-	if (mce_disabled)
+	if (mca_cfg.disabled)
 		return;
 
 	if (__mcheck_cpu_ancient_init(c))
@@ -1678,7 +1681,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
 		return;
 
 	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
-		mce_disabled = 1;
+		mca_cfg.disabled = true;
 		return;
 	}
 
@@ -1951,6 +1954,8 @@ static struct miscdevice mce_chrdev_device = {
  */
 static int __init mcheck_enable(char *str)
 {
+	struct mca_config *cfg = &mca_cfg;
+
 	if (*str == 0) {
 		enable_p5_mce();
 		return 1;
@@ -1958,22 +1963,22 @@ static int __init mcheck_enable(char *str)
 	if (*str == '=')
 		str++;
 	if (!strcmp(str, "off"))
-		mce_disabled = 1;
+		cfg->disabled = true;
 	else if (!strcmp(str, "no_cmci"))
-		mce_cmci_disabled = 1;
+		cfg->cmci_disabled = true;
 	else if (!strcmp(str, "dont_log_ce"))
-		mce_dont_log_ce = 1;
+		cfg->dont_log_ce = true;
 	else if (!strcmp(str, "ignore_ce"))
-		mce_ignore_ce = 1;
+		cfg->ignore_ce = true;
 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
-		mce_bootlog = (str[0] == 'b');
+		cfg->bootlog = (str[0] == 'b');
 	else if (!strcmp(str, "bios_cmci_threshold"))
-		mce_bios_cmci_threshold = 1;
+		cfg->bios_cmci_threshold = true;
 	else if (isdigit(str[0])) {
-		get_option(&str, &tolerant);
+		get_option(&str, &(cfg->tolerant));
 		if (*str == ',') {
 			++str;
-			get_option(&str, &monarch_timeout);
+			get_option(&str, &(cfg->monarch_timeout));
 		}
 	} else {
 		pr_info("mce argument %s ignored. Please use /sys\n", str);
@@ -2002,7 +2007,7 @@ static int mce_disable_error_reporting(void)
 {
 	int i;
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		if (b->init)
@@ -2142,15 +2147,15 @@ static ssize_t set_ignore_ce(struct device *s,
 	if (strict_strtoull(buf, 0, &new) < 0)
 		return -EINVAL;
 
-	if (mce_ignore_ce ^ !!new) {
+	if (mca_cfg.ignore_ce ^ !!new) {
 		if (new) {
 			/* disable ce features */
 			mce_timer_delete_all();
 			on_each_cpu(mce_disable_cmci, NULL, 1);
-			mce_ignore_ce = 1;
+			mca_cfg.ignore_ce = true;
 		} else {
 			/* enable ce features */
-			mce_ignore_ce = 0;
+			mca_cfg.ignore_ce = false;
 			on_each_cpu(mce_enable_ce, (void *)1, 1);
 		}
 	}
@@ -2166,14 +2171,14 @@ static ssize_t set_cmci_disabled(struct device *s,
 	if (strict_strtoull(buf, 0, &new) < 0)
 		return -EINVAL;
 
-	if (mce_cmci_disabled ^ !!new) {
+	if (mca_cfg.cmci_disabled ^ !!new) {
 		if (new) {
 			/* disable cmci */
 			on_each_cpu(mce_disable_cmci, NULL, 1);
-			mce_cmci_disabled = 1;
+			mca_cfg.cmci_disabled = true;
 		} else {
 			/* enable cmci */
-			mce_cmci_disabled = 0;
+			mca_cfg.cmci_disabled = false;
 			on_each_cpu(mce_enable_ce, NULL, 1);
 		}
 	}
@@ -2190,9 +2195,9 @@ static ssize_t store_int_with_restart(struct device *s,
 }
 
 static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
-static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
-static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
+static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
 
 static struct dev_ext_attribute dev_attr_check_interval = {
 	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
@@ -2200,13 +2205,13 @@ static struct dev_ext_attribute dev_attr_check_interval = {
 };
 
 static struct dev_ext_attribute dev_attr_ignore_ce = {
-	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
-	&mce_ignore_ce
+	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
+	&mca_cfg.ignore_ce
 };
 
 static struct dev_ext_attribute dev_attr_cmci_disabled = {
-	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
-	&mce_cmci_disabled
+	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
+	&mca_cfg.cmci_disabled
 };
 
 static struct device_attribute *mce_device_attrs[] = {
@@ -2253,7 +2258,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
 		if (err)
 			goto error;
 	}
-	for (j = 0; j < banks; j++) {
+	for (j = 0; j < mca_cfg.banks; j++) {
 		err = device_create_file(dev, &mce_banks[j].attr);
 		if (err)
 			goto error2;
@@ -2285,7 +2290,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
 	for (i = 0; mce_device_attrs[i]; i++)
 		device_remove_file(dev, mce_device_attrs[i]);
 
-	for (i = 0; i < banks; i++)
+	for (i = 0; i < mca_cfg.banks; i++)
 		device_remove_file(dev, &mce_banks[i].attr);
 
 	device_unregister(dev);
@@ -2304,7 +2309,7 @@ static void __cpuinit mce_disable_cpu(void *h)
 
 	if (!(action & CPU_TASKS_FROZEN))
2306 cmci_clear(); 2311 cmci_clear();
2307 for (i = 0; i < banks; i++) { 2312 for (i = 0; i < mca_cfg.banks; i++) {
2308 struct mce_bank *b = &mce_banks[i]; 2313 struct mce_bank *b = &mce_banks[i];
2309 2314
2310 if (b->init) 2315 if (b->init)
@@ -2322,7 +2327,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
2322 2327
2323 if (!(action & CPU_TASKS_FROZEN)) 2328 if (!(action & CPU_TASKS_FROZEN))
2324 cmci_reenable(); 2329 cmci_reenable();
2325 for (i = 0; i < banks; i++) { 2330 for (i = 0; i < mca_cfg.banks; i++) {
2326 struct mce_bank *b = &mce_banks[i]; 2331 struct mce_bank *b = &mce_banks[i];
2327 2332
2328 if (b->init) 2333 if (b->init)
@@ -2375,7 +2380,7 @@ static __init void mce_init_banks(void)
2375{ 2380{
2376 int i; 2381 int i;
2377 2382
2378 for (i = 0; i < banks; i++) { 2383 for (i = 0; i < mca_cfg.banks; i++) {
2379 struct mce_bank *b = &mce_banks[i]; 2384 struct mce_bank *b = &mce_banks[i];
2380 struct device_attribute *a = &b->attr; 2385 struct device_attribute *a = &b->attr;
2381 2386
@@ -2426,7 +2431,7 @@ device_initcall_sync(mcheck_init_device);
2426 */ 2431 */
2427static int __init mcheck_disable(char *str) 2432static int __init mcheck_disable(char *str)
2428{ 2433{
2429 mce_disabled = 1; 2434 mca_cfg.disabled = true;
2430 return 1; 2435 return 1;
2431} 2436}
2432__setup("nomce", mcheck_disable); 2437__setup("nomce", mcheck_disable);
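The mcheck_enable() hunk above folds the scattered mce_* globals into a single struct mca_config and turns the flags into bools. A minimal user-space sketch of the same option parsing, with a stand-in struct and strtol() in place of the kernel's get_option() (all names below are illustrative, not the kernel's definitions):

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct mca_config_sketch {
	bool disabled, cmci_disabled, dont_log_ce, ignore_ce;
	bool bios_cmci_threshold;
	int bootlog, tolerant, monarch_timeout;
};

static void parse_mce_arg(struct mca_config_sketch *cfg, const char *str)
{
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		cfg->disabled = true;
	else if (!strcmp(str, "no_cmci"))
		cfg->cmci_disabled = true;
	else if (!strcmp(str, "dont_log_ce"))
		cfg->dont_log_ce = true;
	else if (!strcmp(str, "ignore_ce"))
		cfg->ignore_ce = true;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		cfg->bootlog = (str[0] == 'b');
	else if (!strcmp(str, "bios_cmci_threshold"))
		cfg->bios_cmci_threshold = true;
	else if (isdigit((unsigned char)str[0])) {
		char *end;

		/* "tolerant[,monarch_timeout]", as in the hunk above */
		cfg->tolerant = (int)strtol(str, &end, 0);
		if (*end == ',')
			cfg->monarch_timeout = (int)strtol(end + 1, NULL, 0);
	} else {
		printf("mce argument %s ignored\n", str);
	}
}

int main(void)
{
	struct mca_config_sketch cfg = { .bootlog = -1 };

	parse_mce_arg(&cfg, "=3,50");		/* mce=3,50 */
	parse_mce_arg(&cfg, "no_cmci");		/* mce=no_cmci */
	printf("tolerant=%d monarch_timeout=%d cmci_disabled=%d\n",
	       cfg.tolerant, cfg.monarch_timeout, cfg.cmci_disabled);
	return 0;
}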
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 698b6ec12e0f..1ac581f38dfa 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * Written by Jacob Shin - AMD, Inc. 7 * Written by Jacob Shin - AMD, Inc.
8 * 8 *
9 * Support: borislav.petkov@amd.com 9 * Maintained by: Borislav Petkov <bp@alien8.de>
10 * 10 *
11 * April 2006 11 * April 2006
12 * - added support for AMD Family 0x10 processors 12 * - added support for AMD Family 0x10 processors
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 5f88abf07e9c..402c454fbff0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -53,7 +53,7 @@ static int cmci_supported(int *banks)
53{ 53{
54 u64 cap; 54 u64 cap;
55 55
56 if (mce_cmci_disabled || mce_ignore_ce) 56 if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
57 return 0; 57 return 0;
58 58
59 /* 59 /*
@@ -200,7 +200,7 @@ static void cmci_discover(int banks)
200 continue; 200 continue;
201 } 201 }
202 202
203 if (!mce_bios_cmci_threshold) { 203 if (!mca_cfg.bios_cmci_threshold) {
204 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; 204 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
205 val |= CMCI_THRESHOLD; 205 val |= CMCI_THRESHOLD;
206 } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { 206 } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
@@ -227,7 +227,7 @@ static void cmci_discover(int banks)
227 * set the thresholds properly or does not work with 227 * set the thresholds properly or does not work with
228 * this boot option. Note down now and report later. 228 * this boot option. Note down now and report later.
229 */ 229 */
230 if (mce_bios_cmci_threshold && bios_zero_thresh && 230 if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
231 (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) 231 (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
232 bios_wrong_thresh = 1; 232 bios_wrong_thresh = 1;
233 } else { 233 } else {
@@ -235,7 +235,7 @@ static void cmci_discover(int banks)
235 } 235 }
236 } 236 }
237 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); 237 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
238 if (mce_bios_cmci_threshold && bios_wrong_thresh) { 238 if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
239 pr_info_once( 239 pr_info_once(
240 "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); 240 "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
241 pr_info_once( 241 pr_info_once(
@@ -285,34 +285,39 @@ void cmci_clear(void)
285 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); 285 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
286} 286}
287 287
288static long cmci_rediscover_work_func(void *arg)
289{
290 int banks;
291
292 /* Recheck banks in case CPUs don't all have the same */
293 if (cmci_supported(&banks))
294 cmci_discover(banks);
295
296 return 0;
297}
298
288/* 299/*
289 * After a CPU went down cycle through all the others and rediscover 300 * After a CPU went down cycle through all the others and rediscover
290 * Must run in process context. 301 * Must run in process context.
291 */ 302 */
292void cmci_rediscover(int dying) 303void cmci_rediscover(int dying)
293{ 304{
294 int banks; 305 int cpu, banks;
295 int cpu;
296 cpumask_var_t old;
297 306
298 if (!cmci_supported(&banks)) 307 if (!cmci_supported(&banks))
299 return; 308 return;
300 if (!alloc_cpumask_var(&old, GFP_KERNEL))
301 return;
302 cpumask_copy(old, &current->cpus_allowed);
303 309
304 for_each_online_cpu(cpu) { 310 for_each_online_cpu(cpu) {
305 if (cpu == dying) 311 if (cpu == dying)
306 continue; 312 continue;
307 if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) 313
314 if (cpu == smp_processor_id()) {
315 cmci_rediscover_work_func(NULL);
308 continue; 316 continue;
309 /* Recheck banks in case CPUs don't all have the same */ 317 }
310 if (cmci_supported(&banks))
311 cmci_discover(banks);
312 }
313 318
314 set_cpus_allowed_ptr(current, old); 319 work_on_cpu(cpu, cmci_rediscover_work_func, NULL);
315 free_cpumask_var(old); 320 }
316} 321}
317 322
318/* 323/*
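The cmci_rediscover() rewrite above drops the "migrate current with set_cpus_allowed_ptr()" approach in favor of work_on_cpu(), which queues the rediscovery on the target CPU's workqueue and waits for it to finish. A hedged sketch of that pattern in isolation; the function and variable names are made up, and the explicit hotplug lock is an addition for a generic caller (the notifier-driven caller above does not take it):

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static long rescan_one_cpu(void *arg)
{
	/* Runs in process context on the CPU it was queued for. */
	pr_info("rescanning on CPU %d\n", smp_processor_id());
	return 0;
}

static void rescan_all_but(int dying)
{
	int cpu;

	get_online_cpus();		/* keep the online mask stable */
	for_each_online_cpu(cpu) {
		if (cpu == dying)
			continue;
		if (cpu == smp_processor_id()) {
			/* As in the hunk above: handle the local CPU directly. */
			rescan_one_cpu(NULL);
			continue;
		}
		work_on_cpu(cpu, rescan_one_cpu, NULL);
	}
	put_online_cpus();
}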
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6b96110bb0c3..726bf963c227 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -606,7 +606,7 @@ void __init mtrr_bp_init(void)
606 606
607 /* 607 /*
608 * This is an AMD specific MSR, but we assume(hope?) that 608 * This is an AMD specific MSR, but we assume(hope?) that
609 * Intel will implement it to when they extend the address 609 * Intel will implement it too when they extend the address
610 * bus of the Xeon. 610 * bus of the Xeon.
611 */ 611 */
612 if (cpuid_eax(0x80000000) >= 0x80000008) { 612 if (cpuid_eax(0x80000000) >= 0x80000008) {
@@ -695,11 +695,16 @@ void mtrr_ap_init(void)
695} 695}
696 696
697/** 697/**
698 * Save current fixed-range MTRR state of the BSP 698 * Save current fixed-range MTRR state of the first cpu in cpu_online_mask.
699 */ 699 */
700void mtrr_save_state(void) 700void mtrr_save_state(void)
701{ 701{
702 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); 702 int first_cpu;
703
704 get_online_cpus();
705 first_cpu = cpumask_first(cpu_online_mask);
706 smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
707 put_online_cpus();
703} 708}
704 709
705void set_mtrr_aps_delayed_init(void) 710void set_mtrr_aps_delayed_init(void)
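mtrr_save_state() stops hard-coding CPU 0 and instead targets the first CPU in cpu_online_mask under get_online_cpus(), presumably because with the CPU0 hotplug support visible later in this diff (start_cpu0 in head_32.S and head_64.S) the boot CPU is no longer guaranteed to be online. The same "pick a stable target under the hotplug lock" pattern in isolation, with illustrative names:

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

static void do_save(void *unused)
{
	/* Invoked via IPI on the chosen CPU by smp_call_function_single(). */
}

static void save_on_first_online_cpu(void)
{
	int cpu;

	get_online_cpus();			/* the chosen CPU cannot go away */
	cpu = cpumask_first(cpu_online_mask);
	smp_call_function_single(cpu, do_save, NULL, 1);
	put_online_cpus();
}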
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4a3374e61a93..6774c17a5576 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -340,9 +340,6 @@ int x86_setup_perfctr(struct perf_event *event)
340 /* BTS is currently only allowed for user-mode. */ 340 /* BTS is currently only allowed for user-mode. */
341 if (!attr->exclude_kernel) 341 if (!attr->exclude_kernel)
342 return -EOPNOTSUPP; 342 return -EOPNOTSUPP;
343
344 if (!attr->exclude_guest)
345 return -EOPNOTSUPP;
346 } 343 }
347 344
348 hwc->config |= config; 345 hwc->config |= config;
@@ -385,9 +382,6 @@ int x86_pmu_hw_config(struct perf_event *event)
385 if (event->attr.precise_ip) { 382 if (event->attr.precise_ip) {
386 int precise = 0; 383 int precise = 0;
387 384
388 if (!event->attr.exclude_guest)
389 return -EOPNOTSUPP;
390
391 /* Support for constant skid */ 385 /* Support for constant skid */
392 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { 386 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
393 precise++; 387 precise++;
@@ -1316,6 +1310,121 @@ static struct attribute_group x86_pmu_format_group = {
1316 .attrs = NULL, 1310 .attrs = NULL,
1317}; 1311};
1318 1312
1313struct perf_pmu_events_attr {
1314 struct device_attribute attr;
1315 u64 id;
1316};
1317
1318/*
1319 * Remove all undefined events (x86_pmu.event_map(id) == 0)
1320 * out of events_attr attributes.
1321 */
1322static void __init filter_events(struct attribute **attrs)
1323{
1324 int i, j;
1325
1326 for (i = 0; attrs[i]; i++) {
1327 if (x86_pmu.event_map(i))
1328 continue;
1329
1330 for (j = i; attrs[j]; j++)
1331 attrs[j] = attrs[j + 1];
1332
1333 /* Check the shifted attr. */
1334 i--;
1335 }
1336}
1337
1338static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
1339 char *page)
1340{
1341 struct perf_pmu_events_attr *pmu_attr = \
1342 container_of(attr, struct perf_pmu_events_attr, attr);
1343
1344 u64 config = x86_pmu.event_map(pmu_attr->id);
1345 return x86_pmu.events_sysfs_show(page, config);
1346}
1347
1348#define EVENT_VAR(_id) event_attr_##_id
1349#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
1350
1351#define EVENT_ATTR(_name, _id) \
1352static struct perf_pmu_events_attr EVENT_VAR(_id) = { \
1353 .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
1354 .id = PERF_COUNT_HW_##_id, \
1355};
1356
1357EVENT_ATTR(cpu-cycles, CPU_CYCLES );
1358EVENT_ATTR(instructions, INSTRUCTIONS );
1359EVENT_ATTR(cache-references, CACHE_REFERENCES );
1360EVENT_ATTR(cache-misses, CACHE_MISSES );
1361EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS );
1362EVENT_ATTR(branch-misses, BRANCH_MISSES );
1363EVENT_ATTR(bus-cycles, BUS_CYCLES );
1364EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND );
1365EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND );
1366EVENT_ATTR(ref-cycles, REF_CPU_CYCLES );
1367
1368static struct attribute *empty_attrs;
1369
1370static struct attribute *events_attr[] = {
1371 EVENT_PTR(CPU_CYCLES),
1372 EVENT_PTR(INSTRUCTIONS),
1373 EVENT_PTR(CACHE_REFERENCES),
1374 EVENT_PTR(CACHE_MISSES),
1375 EVENT_PTR(BRANCH_INSTRUCTIONS),
1376 EVENT_PTR(BRANCH_MISSES),
1377 EVENT_PTR(BUS_CYCLES),
1378 EVENT_PTR(STALLED_CYCLES_FRONTEND),
1379 EVENT_PTR(STALLED_CYCLES_BACKEND),
1380 EVENT_PTR(REF_CPU_CYCLES),
1381 NULL,
1382};
1383
1384static struct attribute_group x86_pmu_events_group = {
1385 .name = "events",
1386 .attrs = events_attr,
1387};
1388
1389ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
1390{
1391 u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
1392 u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
1393 bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
1394 bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
1395 bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
1396 bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
1397 ssize_t ret;
1398
1399 /*
1400 * We have whole page size to spend and just little data
1401 * to write, so we can safely use sprintf.
1402 */
1403 ret = sprintf(page, "event=0x%02llx", event);
1404
1405 if (umask)
1406 ret += sprintf(page + ret, ",umask=0x%02llx", umask);
1407
1408 if (edge)
1409 ret += sprintf(page + ret, ",edge");
1410
1411 if (pc)
1412 ret += sprintf(page + ret, ",pc");
1413
1414 if (any)
1415 ret += sprintf(page + ret, ",any");
1416
1417 if (inv)
1418 ret += sprintf(page + ret, ",inv");
1419
1420 if (cmask)
1421 ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
1422
1423 ret += sprintf(page + ret, "\n");
1424
1425 return ret;
1426}
1427
1319static int __init init_hw_perf_events(void) 1428static int __init init_hw_perf_events(void)
1320{ 1429{
1321 struct x86_pmu_quirk *quirk; 1430 struct x86_pmu_quirk *quirk;
@@ -1362,6 +1471,11 @@ static int __init init_hw_perf_events(void)
1362 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ 1471 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1363 x86_pmu_format_group.attrs = x86_pmu.format_attrs; 1472 x86_pmu_format_group.attrs = x86_pmu.format_attrs;
1364 1473
1474 if (!x86_pmu.events_sysfs_show)
1475 x86_pmu_events_group.attrs = &empty_attrs;
1476 else
1477 filter_events(x86_pmu_events_group.attrs);
1478
1365 pr_info("... version: %d\n", x86_pmu.version); 1479 pr_info("... version: %d\n", x86_pmu.version);
1366 pr_info("... bit width: %d\n", x86_pmu.cntval_bits); 1480 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
1367 pr_info("... generic registers: %d\n", x86_pmu.num_counters); 1481 pr_info("... generic registers: %d\n", x86_pmu.num_counters);
@@ -1651,6 +1765,7 @@ static struct attribute_group x86_pmu_attr_group = {
1651static const struct attribute_group *x86_pmu_attr_groups[] = { 1765static const struct attribute_group *x86_pmu_attr_groups[] = {
1652 &x86_pmu_attr_group, 1766 &x86_pmu_attr_group,
1653 &x86_pmu_format_group, 1767 &x86_pmu_format_group,
1768 &x86_pmu_events_group,
1654 NULL, 1769 NULL,
1655}; 1770};
1656 1771
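filter_events() above compacts the events_attr[] array in place: when an entry is dropped, everything after it shifts down one slot and the same index is re-checked, because that slot now holds an entry that has not been tested yet. A stand-alone demonstration of the compaction; to keep it self-contained the test is on the entry itself, whereas the kernel keys off x86_pmu.event_map() by index:

#include <stdio.h>
#include <string.h>

static int implemented(const char *name)
{
	/* Pretend these two generic events are not implemented on this PMU. */
	return strcmp(name, "cache-references") && strcmp(name, "cache-misses");
}

static void filter(const char **attrs)
{
	int i, j;

	for (i = 0; attrs[i]; i++) {
		if (implemented(attrs[i]))
			continue;

		for (j = i; attrs[j]; j++)
			attrs[j] = attrs[j + 1];

		i--;	/* slot i now holds an entry that was never tested */
	}
}

int main(void)
{
	const char *attrs[] = {
		"cpu-cycles", "instructions", "cache-references",
		"cache-misses", "branch-misses", NULL,
	};
	int i;

	filter(attrs);
	for (i = 0; attrs[i]; i++)
		printf("%s\n", attrs[i]);	/* cpu-cycles, instructions, branch-misses */
	return 0;
}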
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 271d25700297..115c1ea97746 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -354,6 +354,8 @@ struct x86_pmu {
354 int attr_rdpmc; 354 int attr_rdpmc;
355 struct attribute **format_attrs; 355 struct attribute **format_attrs;
356 356
357 ssize_t (*events_sysfs_show)(char *page, u64 config);
358
357 /* 359 /*
358 * CPU Hotplug hooks 360 * CPU Hotplug hooks
359 */ 361 */
@@ -536,6 +538,9 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
536 regs->ip = ip; 538 regs->ip = ip;
537} 539}
538 540
541ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
542ssize_t intel_event_sysfs_show(char *page, u64 config);
543
539#ifdef CONFIG_CPU_SUP_AMD 544#ifdef CONFIG_CPU_SUP_AMD
540 545
541int amd_pmu_init(void); 546int amd_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 4528ae7b6ec4..c93bc4e813a0 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -568,6 +568,14 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
568 } 568 }
569} 569}
570 570
571static ssize_t amd_event_sysfs_show(char *page, u64 config)
572{
573 u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
574 (config & AMD64_EVENTSEL_EVENT) >> 24;
575
576 return x86_event_sysfs_show(page, config, event);
577}
578
571static __initconst const struct x86_pmu amd_pmu = { 579static __initconst const struct x86_pmu amd_pmu = {
572 .name = "AMD", 580 .name = "AMD",
573 .handle_irq = x86_pmu_handle_irq, 581 .handle_irq = x86_pmu_handle_irq,
@@ -591,6 +599,7 @@ static __initconst const struct x86_pmu amd_pmu = {
591 .put_event_constraints = amd_put_event_constraints, 599 .put_event_constraints = amd_put_event_constraints,
592 600
593 .format_attrs = amd_format_attr, 601 .format_attrs = amd_format_attr,
602 .events_sysfs_show = amd_event_sysfs_show,
594 603
595 .cpu_prepare = amd_pmu_cpu_prepare, 604 .cpu_prepare = amd_pmu_cpu_prepare,
596 .cpu_starting = amd_pmu_cpu_starting, 605 .cpu_starting = amd_pmu_cpu_starting,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 324bb523d9d9..93b9e1181f83 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1603,6 +1603,13 @@ static struct attribute *intel_arch_formats_attr[] = {
1603 NULL, 1603 NULL,
1604}; 1604};
1605 1605
1606ssize_t intel_event_sysfs_show(char *page, u64 config)
1607{
1608 u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
1609
1610 return x86_event_sysfs_show(page, config, event);
1611}
1612
1606static __initconst const struct x86_pmu core_pmu = { 1613static __initconst const struct x86_pmu core_pmu = {
1607 .name = "core", 1614 .name = "core",
1608 .handle_irq = x86_pmu_handle_irq, 1615 .handle_irq = x86_pmu_handle_irq,
@@ -1628,6 +1635,7 @@ static __initconst const struct x86_pmu core_pmu = {
1628 .event_constraints = intel_core_event_constraints, 1635 .event_constraints = intel_core_event_constraints,
1629 .guest_get_msrs = core_guest_get_msrs, 1636 .guest_get_msrs = core_guest_get_msrs,
1630 .format_attrs = intel_arch_formats_attr, 1637 .format_attrs = intel_arch_formats_attr,
1638 .events_sysfs_show = intel_event_sysfs_show,
1631}; 1639};
1632 1640
1633struct intel_shared_regs *allocate_shared_regs(int cpu) 1641struct intel_shared_regs *allocate_shared_regs(int cpu)
@@ -1766,6 +1774,7 @@ static __initconst const struct x86_pmu intel_pmu = {
1766 .pebs_aliases = intel_pebs_aliases_core2, 1774 .pebs_aliases = intel_pebs_aliases_core2,
1767 1775
1768 .format_attrs = intel_arch3_formats_attr, 1776 .format_attrs = intel_arch3_formats_attr,
1777 .events_sysfs_show = intel_event_sysfs_show,
1769 1778
1770 .cpu_prepare = intel_pmu_cpu_prepare, 1779 .cpu_prepare = intel_pmu_cpu_prepare,
1771 .cpu_starting = intel_pmu_cpu_starting, 1780 .cpu_starting = intel_pmu_cpu_starting,
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 3cf3d97cce3a..b43200dbfe7e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -2500,7 +2500,7 @@ static bool pcidrv_registered;
2500/* 2500/*
2501 * add a pci uncore device 2501 * add a pci uncore device
2502 */ 2502 */
2503static int __devinit uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) 2503static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev)
2504{ 2504{
2505 struct intel_uncore_pmu *pmu; 2505 struct intel_uncore_pmu *pmu;
2506 struct intel_uncore_box *box; 2506 struct intel_uncore_box *box;
@@ -2571,8 +2571,8 @@ static void uncore_pci_remove(struct pci_dev *pdev)
2571 kfree(box); 2571 kfree(box);
2572} 2572}
2573 2573
2574static int __devinit uncore_pci_probe(struct pci_dev *pdev, 2574static int uncore_pci_probe(struct pci_dev *pdev,
2575 const struct pci_device_id *id) 2575 const struct pci_device_id *id)
2576{ 2576{
2577 struct intel_uncore_type *type; 2577 struct intel_uncore_type *type;
2578 2578
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 7d0270bd793e..f2af39f5dc3d 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -227,6 +227,8 @@ static __initconst const struct x86_pmu p6_pmu = {
227 .event_constraints = p6_event_constraints, 227 .event_constraints = p6_event_constraints,
228 228
229 .format_attrs = intel_p6_formats_attr, 229 .format_attrs = intel_p6_formats_attr,
230 .events_sysfs_show = intel_event_sysfs_show,
231
230}; 232};
231 233
232__init int p6_pmu_init(void) 234__init int p6_pmu_init(void)
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index fbd895562292..3286a92e662a 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -26,11 +26,6 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
26#ifdef CONFIG_X86_32 26#ifdef CONFIG_X86_32
27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) 27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
28{ 28{
29 /*
30 * We use exception 16 if we have hardware math and we've either seen
31 * it or the CPU claims it is internal
32 */
33 int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
34 seq_printf(m, 29 seq_printf(m,
35 "fdiv_bug\t: %s\n" 30 "fdiv_bug\t: %s\n"
36 "hlt_bug\t\t: %s\n" 31 "hlt_bug\t\t: %s\n"
@@ -45,7 +40,7 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
45 c->f00f_bug ? "yes" : "no", 40 c->f00f_bug ? "yes" : "no",
46 c->coma_bug ? "yes" : "no", 41 c->coma_bug ? "yes" : "no",
47 c->hard_math ? "yes" : "no", 42 c->hard_math ? "yes" : "no",
48 fpu_exception ? "yes" : "no", 43 c->hard_math ? "yes" : "no",
49 c->cpuid_level, 44 c->cpuid_level,
50 c->wp_works_ok ? "yes" : "no"); 45 c->wp_works_ok ? "yes" : "no");
51} 46}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 13ad89971d47..74467feb4dc5 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -16,6 +16,7 @@
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/elf.h> 17#include <linux/elf.h>
18#include <linux/elfcore.h> 18#include <linux/elfcore.h>
19#include <linux/module.h>
19 20
20#include <asm/processor.h> 21#include <asm/processor.h>
21#include <asm/hardirq.h> 22#include <asm/hardirq.h>
@@ -30,6 +31,27 @@
30 31
31int in_crash_kexec; 32int in_crash_kexec;
32 33
34/*
35 * This is used to VMCLEAR all VMCSs loaded on the
36 * processor. And when loading kvm_intel module, the
37 * callback function pointer will be assigned.
38 *
39 * protected by rcu.
40 */
41crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
42EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
43
44static inline void cpu_crash_vmclear_loaded_vmcss(void)
45{
46 crash_vmclear_fn *do_vmclear_operation = NULL;
47
48 rcu_read_lock();
49 do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
50 if (do_vmclear_operation)
51 do_vmclear_operation();
52 rcu_read_unlock();
53}
54
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 55#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
34 56
35static void kdump_nmi_callback(int cpu, struct pt_regs *regs) 57static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
46#endif 68#endif
47 crash_save_cpu(regs, cpu); 69 crash_save_cpu(regs, cpu);
48 70
71 /*
72 * VMCLEAR VMCSs loaded on all cpus if needed.
73 */
74 cpu_crash_vmclear_loaded_vmcss();
75
49 /* Disable VMX or SVM if needed. 76 /* Disable VMX or SVM if needed.
50 * 77 *
51 * We need to disable virtualization on all CPUs. 78 * We need to disable virtualization on all CPUs.
@@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
88 115
89 kdump_nmi_shootdown_cpus(); 116 kdump_nmi_shootdown_cpus();
90 117
118 /*
119 * VMCLEAR VMCSs loaded on this cpu if needed.
120 */
121 cpu_crash_vmclear_loaded_vmcss();
122
91 /* Booting kdump kernel with VMX or SVM enabled won't work, 123 /* Booting kdump kernel with VMX or SVM enabled won't work,
92 * because (among other limitations) we can't disable paging 124 * because (among other limitations) we can't disable paging
93 * with the virt flags. 125 * with the virt flags.
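The crash path above only reads crash_vmclear_loaded_vmcss under rcu_read_lock(); the writer side is not part of this hunk. A hedged sketch of how a module such as kvm_intel would be expected to publish and withdraw the callback (the extern declaration mirrors the export above, while the typedef placement and the local function are assumptions):

#include <linux/rcupdate.h>

typedef void (crash_vmclear_fn)(void);	/* mirrors the kernel typedef */
extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;

static void vmclear_local_loaded_vmcss(void)
{
	/* VMCLEAR every VMCS this CPU currently has loaded (stub). */
}

static void publish_crash_vmclear(void)
{
	/* Pairs with rcu_dereference() in cpu_crash_vmclear_loaded_vmcss(). */
	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
			   vmclear_local_loaded_vmcss);
}

static void withdraw_crash_vmclear(void)
{
	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
	/* Wait for any crash/kexec reader still using the old pointer. */
	synchronize_rcu();
}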
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 88b725aa1d52..6ed91d9980e2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -739,30 +739,11 @@ ENTRY(ptregs_##name) ; \
739ENDPROC(ptregs_##name) 739ENDPROC(ptregs_##name)
740 740
741PTREGSCALL1(iopl) 741PTREGSCALL1(iopl)
742PTREGSCALL0(fork)
743PTREGSCALL0(vfork)
744PTREGSCALL2(sigaltstack)
745PTREGSCALL0(sigreturn) 742PTREGSCALL0(sigreturn)
746PTREGSCALL0(rt_sigreturn) 743PTREGSCALL0(rt_sigreturn)
747PTREGSCALL2(vm86) 744PTREGSCALL2(vm86)
748PTREGSCALL1(vm86old) 745PTREGSCALL1(vm86old)
749 746
750/* Clone is an oddball. The 4th arg is in %edi */
751ENTRY(ptregs_clone)
752 CFI_STARTPROC
753 leal 4(%esp),%eax
754 pushl_cfi %eax
755 pushl_cfi PT_EDI(%eax)
756 movl PT_EDX(%eax),%ecx
757 movl PT_ECX(%eax),%edx
758 movl PT_EBX(%eax),%eax
759 call sys_clone
760 addl $8,%esp
761 CFI_ADJUST_CFA_OFFSET -8
762 ret
763 CFI_ENDPROC
764ENDPROC(ptregs_clone)
765
766.macro FIXUP_ESPFIX_STACK 747.macro FIXUP_ESPFIX_STACK
767/* 748/*
768 * Switch back for ESPFIX stack to the normal zerobased stack 749 * Switch back for ESPFIX stack to the normal zerobased stack
@@ -1084,7 +1065,6 @@ ENTRY(xen_failsafe_callback)
1084 lea 16(%esp),%esp 1065 lea 16(%esp),%esp
1085 CFI_ADJUST_CFA_OFFSET -16 1066 CFI_ADJUST_CFA_OFFSET -16
1086 jz 5f 1067 jz 5f
1087 addl $16,%esp
1088 jmp iret_exc 1068 jmp iret_exc
10895: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ 10695: pushl_cfi $-1 /* orig_ax = -1 => not a system call */
1090 SAVE_ALL 1070 SAVE_ALL
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b51b2c7ee51f..07a7a04529bc 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -56,7 +56,7 @@
56#include <asm/ftrace.h> 56#include <asm/ftrace.h>
57#include <asm/percpu.h> 57#include <asm/percpu.h>
58#include <asm/asm.h> 58#include <asm/asm.h>
59#include <asm/rcu.h> 59#include <asm/context_tracking.h>
60#include <asm/smap.h> 60#include <asm/smap.h>
61#include <linux/err.h> 61#include <linux/err.h>
62 62
@@ -845,10 +845,25 @@ ENTRY(\label)
845END(\label) 845END(\label)
846 .endm 846 .endm
847 847
848 PTREGSCALL stub_clone, sys_clone, %r8 848 .macro FORK_LIKE func
849 PTREGSCALL stub_fork, sys_fork, %rdi 849ENTRY(stub_\func)
850 PTREGSCALL stub_vfork, sys_vfork, %rdi 850 CFI_STARTPROC
851 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx 851 popq %r11 /* save return address */
852 PARTIAL_FRAME 0
853 SAVE_REST
854 pushq %r11 /* put it back on stack */
855 FIXUP_TOP_OF_STACK %r11, 8
856 DEFAULT_FRAME 0 8 /* offset 8: return address */
857 call sys_\func
858 RESTORE_TOP_OF_STACK %r11, 8
859 ret $REST_SKIP /* pop extended registers */
860 CFI_ENDPROC
861END(stub_\func)
862 .endm
863
864 FORK_LIKE clone
865 FORK_LIKE fork
866 FORK_LIKE vfork
852 PTREGSCALL stub_iopl, sys_iopl, %rsi 867 PTREGSCALL stub_iopl, sys_iopl, %rsi
853 868
854ENTRY(ptregscall_common) 869ENTRY(ptregscall_common)
@@ -897,8 +912,6 @@ ENTRY(stub_rt_sigreturn)
897END(stub_rt_sigreturn) 912END(stub_rt_sigreturn)
898 913
899#ifdef CONFIG_X86_X32_ABI 914#ifdef CONFIG_X86_X32_ABI
900 PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx
901
902ENTRY(stub_x32_rt_sigreturn) 915ENTRY(stub_x32_rt_sigreturn)
903 CFI_STARTPROC 916 CFI_STARTPROC
904 addq $8, %rsp 917 addq $8, %rsp
@@ -995,8 +1008,8 @@ END(interrupt)
995 */ 1008 */
996 .p2align CONFIG_X86_L1_CACHE_SHIFT 1009 .p2align CONFIG_X86_L1_CACHE_SHIFT
997common_interrupt: 1010common_interrupt:
998 ASM_CLAC
999 XCPT_FRAME 1011 XCPT_FRAME
1012 ASM_CLAC
1000 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 1013 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
1001 interrupt do_IRQ 1014 interrupt do_IRQ
1002 /* 0(%rsp): old_rsp-ARGOFFSET */ 1015 /* 0(%rsp): old_rsp-ARGOFFSET */
@@ -1135,8 +1148,8 @@ END(common_interrupt)
1135 */ 1148 */
1136.macro apicinterrupt num sym do_sym 1149.macro apicinterrupt num sym do_sym
1137ENTRY(\sym) 1150ENTRY(\sym)
1138 ASM_CLAC
1139 INTR_FRAME 1151 INTR_FRAME
1152 ASM_CLAC
1140 pushq_cfi $~(\num) 1153 pushq_cfi $~(\num)
1141.Lcommon_\sym: 1154.Lcommon_\sym:
1142 interrupt \do_sym 1155 interrupt \do_sym
@@ -1190,8 +1203,8 @@ apicinterrupt IRQ_WORK_VECTOR \
1190 */ 1203 */
1191.macro zeroentry sym do_sym 1204.macro zeroentry sym do_sym
1192ENTRY(\sym) 1205ENTRY(\sym)
1193 ASM_CLAC
1194 INTR_FRAME 1206 INTR_FRAME
1207 ASM_CLAC
1195 PARAVIRT_ADJUST_EXCEPTION_FRAME 1208 PARAVIRT_ADJUST_EXCEPTION_FRAME
1196 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1209 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1197 subq $ORIG_RAX-R15, %rsp 1210 subq $ORIG_RAX-R15, %rsp
@@ -1208,8 +1221,8 @@ END(\sym)
1208 1221
1209.macro paranoidzeroentry sym do_sym 1222.macro paranoidzeroentry sym do_sym
1210ENTRY(\sym) 1223ENTRY(\sym)
1211 ASM_CLAC
1212 INTR_FRAME 1224 INTR_FRAME
1225 ASM_CLAC
1213 PARAVIRT_ADJUST_EXCEPTION_FRAME 1226 PARAVIRT_ADJUST_EXCEPTION_FRAME
1214 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1227 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1215 subq $ORIG_RAX-R15, %rsp 1228 subq $ORIG_RAX-R15, %rsp
@@ -1227,8 +1240,8 @@ END(\sym)
1227#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) 1240#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
1228.macro paranoidzeroentry_ist sym do_sym ist 1241.macro paranoidzeroentry_ist sym do_sym ist
1229ENTRY(\sym) 1242ENTRY(\sym)
1230 ASM_CLAC
1231 INTR_FRAME 1243 INTR_FRAME
1244 ASM_CLAC
1232 PARAVIRT_ADJUST_EXCEPTION_FRAME 1245 PARAVIRT_ADJUST_EXCEPTION_FRAME
1233 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1246 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1234 subq $ORIG_RAX-R15, %rsp 1247 subq $ORIG_RAX-R15, %rsp
@@ -1247,8 +1260,8 @@ END(\sym)
1247 1260
1248.macro errorentry sym do_sym 1261.macro errorentry sym do_sym
1249ENTRY(\sym) 1262ENTRY(\sym)
1250 ASM_CLAC
1251 XCPT_FRAME 1263 XCPT_FRAME
1264 ASM_CLAC
1252 PARAVIRT_ADJUST_EXCEPTION_FRAME 1265 PARAVIRT_ADJUST_EXCEPTION_FRAME
1253 subq $ORIG_RAX-R15, %rsp 1266 subq $ORIG_RAX-R15, %rsp
1254 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1267 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1266,8 +1279,8 @@ END(\sym)
1266 /* error code is on the stack already */ 1279 /* error code is on the stack already */
1267.macro paranoiderrorentry sym do_sym 1280.macro paranoiderrorentry sym do_sym
1268ENTRY(\sym) 1281ENTRY(\sym)
1269 ASM_CLAC
1270 XCPT_FRAME 1282 XCPT_FRAME
1283 ASM_CLAC
1271 PARAVIRT_ADJUST_EXCEPTION_FRAME 1284 PARAVIRT_ADJUST_EXCEPTION_FRAME
1272 subq $ORIG_RAX-R15, %rsp 1285 subq $ORIG_RAX-R15, %rsp
1273 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1286 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1699,9 +1712,10 @@ nested_nmi:
1699 1712
17001: 17131:
1701 /* Set up the interrupted NMIs stack to jump to repeat_nmi */ 1714 /* Set up the interrupted NMIs stack to jump to repeat_nmi */
1702 leaq -6*8(%rsp), %rdx 1715 leaq -1*8(%rsp), %rdx
1703 movq %rdx, %rsp 1716 movq %rdx, %rsp
1704 CFI_ADJUST_CFA_OFFSET 6*8 1717 CFI_ADJUST_CFA_OFFSET 1*8
1718 leaq -10*8(%rsp), %rdx
1705 pushq_cfi $__KERNEL_DS 1719 pushq_cfi $__KERNEL_DS
1706 pushq_cfi %rdx 1720 pushq_cfi %rdx
1707 pushfq_cfi 1721 pushfq_cfi
@@ -1709,8 +1723,8 @@ nested_nmi:
1709 pushq_cfi $repeat_nmi 1723 pushq_cfi $repeat_nmi
1710 1724
1711 /* Put stack back */ 1725 /* Put stack back */
1712 addq $(11*8), %rsp 1726 addq $(6*8), %rsp
1713 CFI_ADJUST_CFA_OFFSET -11*8 1727 CFI_ADJUST_CFA_OFFSET -6*8
1714 1728
1715nested_nmi_out: 1729nested_nmi_out:
1716 popq_cfi %rdx 1730 popq_cfi %rdx
@@ -1736,18 +1750,18 @@ first_nmi:
1736 * +-------------------------+ 1750 * +-------------------------+
1737 * | NMI executing variable | 1751 * | NMI executing variable |
1738 * +-------------------------+ 1752 * +-------------------------+
1739 * | Saved SS |
1740 * | Saved Return RSP |
1741 * | Saved RFLAGS |
1742 * | Saved CS |
1743 * | Saved RIP |
1744 * +-------------------------+
1745 * | copied SS | 1753 * | copied SS |
1746 * | copied Return RSP | 1754 * | copied Return RSP |
1747 * | copied RFLAGS | 1755 * | copied RFLAGS |
1748 * | copied CS | 1756 * | copied CS |
1749 * | copied RIP | 1757 * | copied RIP |
1750 * +-------------------------+ 1758 * +-------------------------+
1759 * | Saved SS |
1760 * | Saved Return RSP |
1761 * | Saved RFLAGS |
1762 * | Saved CS |
1763 * | Saved RIP |
1764 * +-------------------------+
1751 * | pt_regs | 1765 * | pt_regs |
1752 * +-------------------------+ 1766 * +-------------------------+
1753 * 1767 *
@@ -1763,9 +1777,14 @@ first_nmi:
1763 /* Set the NMI executing variable on the stack. */ 1777 /* Set the NMI executing variable on the stack. */
1764 pushq_cfi $1 1778 pushq_cfi $1
1765 1779
1780 /*
1781 * Leave room for the "copied" frame
1782 */
1783 subq $(5*8), %rsp
1784
1766 /* Copy the stack frame to the Saved frame */ 1785 /* Copy the stack frame to the Saved frame */
1767 .rept 5 1786 .rept 5
1768 pushq_cfi 6*8(%rsp) 1787 pushq_cfi 11*8(%rsp)
1769 .endr 1788 .endr
1770 CFI_DEF_CFA_OFFSET SS+8-RIP 1789 CFI_DEF_CFA_OFFSET SS+8-RIP
1771 1790
@@ -1786,12 +1805,15 @@ repeat_nmi:
1786 * is benign for the non-repeat case, where 1 was pushed just above 1805 * is benign for the non-repeat case, where 1 was pushed just above
1787 * to this very stack slot). 1806 * to this very stack slot).
1788 */ 1807 */
1789 movq $1, 5*8(%rsp) 1808 movq $1, 10*8(%rsp)
1790 1809
1791 /* Make another copy, this one may be modified by nested NMIs */ 1810 /* Make another copy, this one may be modified by nested NMIs */
1811 addq $(10*8), %rsp
1812 CFI_ADJUST_CFA_OFFSET -10*8
1792 .rept 5 1813 .rept 5
1793 pushq_cfi 4*8(%rsp) 1814 pushq_cfi -6*8(%rsp)
1794 .endr 1815 .endr
1816 subq $(5*8), %rsp
1795 CFI_DEF_CFA_OFFSET SS+8-RIP 1817 CFI_DEF_CFA_OFFSET SS+8-RIP
1796end_repeat_nmi: 1818end_repeat_nmi:
1797 1819
@@ -1842,8 +1864,12 @@ nmi_swapgs:
1842 SWAPGS_UNSAFE_STACK 1864 SWAPGS_UNSAFE_STACK
1843nmi_restore: 1865nmi_restore:
1844 RESTORE_ALL 8 1866 RESTORE_ALL 8
1867
1868 /* Pop the extra iret frame */
1869 addq $(5*8), %rsp
1870
1845 /* Clear the NMI executing stack variable */ 1871 /* Clear the NMI executing stack variable */
1846 movq $0, 10*8(%rsp) 1872 movq $0, 5*8(%rsp)
1847 jmp irq_return 1873 jmp irq_return
1848 CFI_ENDPROC 1874 CFI_ENDPROC
1849END(nmi) 1875END(nmi)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 957a47aec64e..8e7f6556028f 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -266,6 +266,19 @@ num_subarch_entries = (. - subarch_entries) / 4
266 jmp default_entry 266 jmp default_entry
267#endif /* CONFIG_PARAVIRT */ 267#endif /* CONFIG_PARAVIRT */
268 268
269#ifdef CONFIG_HOTPLUG_CPU
270/*
271 * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
272 * up already except stack. We just set up stack here. Then call
273 * start_secondary().
274 */
275ENTRY(start_cpu0)
276 movl stack_start, %ecx
277 movl %ecx, %esp
278 jmp *(initial_code)
279ENDPROC(start_cpu0)
280#endif
281
269/* 282/*
270 * Non-boot CPU entry point; entered from trampoline.S 283 * Non-boot CPU entry point; entered from trampoline.S
271 * We can't lgdt here, because lgdt itself uses a data segment, but 284 * We can't lgdt here, because lgdt itself uses a data segment, but
@@ -292,8 +305,8 @@ default_entry:
292 * be using the global pages. 305 * be using the global pages.
293 * 306 *
294 * NOTE! If we are on a 486 we may have no cr4 at all! 307 * NOTE! If we are on a 486 we may have no cr4 at all!
295 * Specifically, cr4 exists if and only if CPUID exists, 308 * Specifically, cr4 exists if and only if CPUID exists
296 * which in turn exists if and only if EFLAGS.ID exists. 309 * and has flags other than the FPU flag set.
297 */ 310 */
298 movl $X86_EFLAGS_ID,%ecx 311 movl $X86_EFLAGS_ID,%ecx
299 pushl %ecx 312 pushl %ecx
@@ -308,6 +321,11 @@ default_entry:
308 testl %ecx,%eax 321 testl %ecx,%eax
309 jz 6f # No ID flag = no CPUID = no CR4 322 jz 6f # No ID flag = no CPUID = no CR4
310 323
324 movl $1,%eax
325 cpuid
326 andl $~1,%edx # Ignore CPUID.FPU
327 jz 6f # No flags or only CPUID.FPU = no CR4
328
311 movl pa(mmu_cr4_features),%eax 329 movl pa(mmu_cr4_features),%eax
312 movl %eax,%cr4 330 movl %eax,%cr4
313 331
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 94bf9cc2c7ee..980053c4b9cc 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -252,6 +252,22 @@ ENTRY(secondary_startup_64)
252 pushq %rax # target address in negative space 252 pushq %rax # target address in negative space
253 lretq 253 lretq
254 254
255#ifdef CONFIG_HOTPLUG_CPU
256/*
257 * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
258 * up already except stack. We just set up stack here. Then call
259 * start_secondary().
260 */
261ENTRY(start_cpu0)
262 movq stack_start(%rip),%rsp
263 movq initial_code(%rip),%rax
264 pushq $0 # fake return address to stop unwinder
265 pushq $__KERNEL_CS # set correct cs
266 pushq %rax # target address in negative space
267 lretq
268ENDPROC(start_cpu0)
269#endif
270
255 /* SMP bootup changes these two */ 271 /* SMP bootup changes these two */
256 __REFDATA 272 __REFDATA
257 .align 8 273 .align 8
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 1460a5df92f7..e28670f9a589 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -434,7 +434,7 @@ void hpet_msi_unmask(struct irq_data *data)
434 434
435 /* unmask it */ 435 /* unmask it */
436 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 436 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
437 cfg |= HPET_TN_FSB; 437 cfg |= HPET_TN_ENABLE | HPET_TN_FSB;
438 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 438 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
439} 439}
440 440
@@ -445,7 +445,7 @@ void hpet_msi_mask(struct irq_data *data)
445 445
446 /* mask it */ 446 /* mask it */
447 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 447 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
448 cfg &= ~HPET_TN_FSB; 448 cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB);
449 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 449 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
450} 450}
451 451
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 675a05012449..245a71db401a 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -175,7 +175,11 @@ void __cpuinit fpu_init(void)
175 cr0 |= X86_CR0_EM; 175 cr0 |= X86_CR0_EM;
176 write_cr0(cr0); 176 write_cr0(cr0);
177 177
178 if (!smp_processor_id()) 178 /*
179 * init_thread_xstate is only called once to avoid overriding
180 * xstate_size during boot time or during CPU hotplug.
181 */
182 if (xstate_size == 0)
179 init_thread_xstate(); 183 init_thread_xstate();
180 184
181 mxcsr_feature_mask_init(); 185 mxcsr_feature_mask_init();
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 6e03b0d69138..7dc4e459c2b3 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -42,39 +42,6 @@
42 * (these are usually mapped into the 0x30-0xff vector range) 42 * (these are usually mapped into the 0x30-0xff vector range)
43 */ 43 */
44 44
45#ifdef CONFIG_X86_32
46/*
47 * Note that on a 486, we don't want to do a SIGFPE on an irq13
48 * as the irq is unreliable, and exception 16 works correctly
49 * (ie as explained in the intel literature). On a 386, you
50 * can't use exception 16 due to bad IBM design, so we have to
51 * rely on the less exact irq13.
52 *
53 * Careful.. Not only is IRQ13 unreliable, but it is also
54 * leads to races. IBM designers who came up with it should
55 * be shot.
56 */
57
58static irqreturn_t math_error_irq(int cpl, void *dev_id)
59{
60 outb(0, 0xF0);
61 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
62 return IRQ_NONE;
63 math_error(get_irq_regs(), 0, X86_TRAP_MF);
64 return IRQ_HANDLED;
65}
66
67/*
68 * New motherboards sometimes make IRQ 13 be a PCI interrupt,
69 * so allow interrupt sharing.
70 */
71static struct irqaction fpu_irq = {
72 .handler = math_error_irq,
73 .name = "fpu",
74 .flags = IRQF_NO_THREAD,
75};
76#endif
77
78/* 45/*
79 * IRQ2 is cascade interrupt to second interrupt controller 46 * IRQ2 is cascade interrupt to second interrupt controller
80 */ 47 */
@@ -242,13 +209,6 @@ void __init native_init_IRQ(void)
242 setup_irq(2, &irq2); 209 setup_irq(2, &irq2);
243 210
244#ifdef CONFIG_X86_32 211#ifdef CONFIG_X86_32
245 /*
246 * External FPU? Set up irq13 if so, for
247 * original braindamaged IBM FERR coupling.
248 */
249 if (boot_cpu_data.hard_math && !cpu_has_fpu)
250 setup_irq(FPU_IRQ, &fpu_irq);
251
252 irq_ctx_init(smp_processor_id()); 212 irq_ctx_init(smp_processor_id());
253#endif 213#endif
254} 214}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4180a874c764..9c2bd8bd4b4c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,6 +42,8 @@
42#include <asm/apic.h> 42#include <asm/apic.h>
43#include <asm/apicdef.h> 43#include <asm/apicdef.h>
44#include <asm/hypervisor.h> 44#include <asm/hypervisor.h>
45#include <asm/kvm_guest.h>
46#include <asm/context_tracking.h>
45 47
46static int kvmapf = 1; 48static int kvmapf = 1;
47 49
@@ -62,6 +64,15 @@ static int parse_no_stealacc(char *arg)
62 64
63early_param("no-steal-acc", parse_no_stealacc); 65early_param("no-steal-acc", parse_no_stealacc);
64 66
67static int kvmclock_vsyscall = 1;
68static int parse_no_kvmclock_vsyscall(char *arg)
69{
70 kvmclock_vsyscall = 0;
71 return 0;
72}
73
74early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
75
65static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 76static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
66static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); 77static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
67static int has_steal_clock = 0; 78static int has_steal_clock = 0;
@@ -110,11 +121,8 @@ void kvm_async_pf_task_wait(u32 token)
110 struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; 121 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
111 struct kvm_task_sleep_node n, *e; 122 struct kvm_task_sleep_node n, *e;
112 DEFINE_WAIT(wait); 123 DEFINE_WAIT(wait);
113 int cpu, idle;
114 124
115 cpu = get_cpu(); 125 rcu_irq_enter();
116 idle = idle_cpu(cpu);
117 put_cpu();
118 126
119 spin_lock(&b->lock); 127 spin_lock(&b->lock);
120 e = _find_apf_task(b, token); 128 e = _find_apf_task(b, token);
@@ -123,12 +131,14 @@ void kvm_async_pf_task_wait(u32 token)
123 hlist_del(&e->link); 131 hlist_del(&e->link);
124 kfree(e); 132 kfree(e);
125 spin_unlock(&b->lock); 133 spin_unlock(&b->lock);
134
135 rcu_irq_exit();
126 return; 136 return;
127 } 137 }
128 138
129 n.token = token; 139 n.token = token;
130 n.cpu = smp_processor_id(); 140 n.cpu = smp_processor_id();
131 n.halted = idle || preempt_count() > 1; 141 n.halted = is_idle_task(current) || preempt_count() > 1;
132 init_waitqueue_head(&n.wq); 142 init_waitqueue_head(&n.wq);
133 hlist_add_head(&n.link, &b->list); 143 hlist_add_head(&n.link, &b->list);
134 spin_unlock(&b->lock); 144 spin_unlock(&b->lock);
@@ -147,13 +157,16 @@ void kvm_async_pf_task_wait(u32 token)
147 /* 157 /*
148 * We cannot reschedule. So halt. 158 * We cannot reschedule. So halt.
149 */ 159 */
160 rcu_irq_exit();
150 native_safe_halt(); 161 native_safe_halt();
162 rcu_irq_enter();
151 local_irq_disable(); 163 local_irq_disable();
152 } 164 }
153 } 165 }
154 if (!n.halted) 166 if (!n.halted)
155 finish_wait(&n.wq, &wait); 167 finish_wait(&n.wq, &wait);
156 168
169 rcu_irq_exit();
157 return; 170 return;
158} 171}
159EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); 172EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
@@ -247,10 +260,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
247 break; 260 break;
248 case KVM_PV_REASON_PAGE_NOT_PRESENT: 261 case KVM_PV_REASON_PAGE_NOT_PRESENT:
249 /* page is swapped out by the host. */ 262 /* page is swapped out by the host. */
250 rcu_irq_enter(); 263 exception_enter(regs);
251 exit_idle(); 264 exit_idle();
252 kvm_async_pf_task_wait((u32)read_cr2()); 265 kvm_async_pf_task_wait((u32)read_cr2());
253 rcu_irq_exit(); 266 exception_exit(regs);
254 break; 267 break;
255 case KVM_PV_REASON_PAGE_READY: 268 case KVM_PV_REASON_PAGE_READY:
256 rcu_irq_enter(); 269 rcu_irq_enter();
@@ -471,6 +484,9 @@ void __init kvm_guest_init(void)
471 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 484 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
472 apic_set_eoi_write(kvm_guest_apic_eoi_write); 485 apic_set_eoi_write(kvm_guest_apic_eoi_write);
473 486
487 if (kvmclock_vsyscall)
488 kvm_setup_vsyscall_timeinfo();
489
474#ifdef CONFIG_SMP 490#ifdef CONFIG_SMP
475 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 491 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
476 register_cpu_notifier(&kvm_cpu_notifier); 492 register_cpu_notifier(&kvm_cpu_notifier);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f1b42b3a186c..220a360010f8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -23,6 +23,7 @@
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/hardirq.h> 25#include <linux/hardirq.h>
26#include <linux/memblock.h>
26 27
27#include <asm/x86_init.h> 28#include <asm/x86_init.h>
28#include <asm/reboot.h> 29#include <asm/reboot.h>
@@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg)
39early_param("no-kvmclock", parse_no_kvmclock); 40early_param("no-kvmclock", parse_no_kvmclock);
40 41
41/* The hypervisor will put information about time periodically here */ 42/* The hypervisor will put information about time periodically here */
42static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); 43static struct pvclock_vsyscall_time_info *hv_clock;
43static struct pvclock_wall_clock wall_clock; 44static struct pvclock_wall_clock wall_clock;
44 45
45/* 46/*
@@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void)
52 struct pvclock_vcpu_time_info *vcpu_time; 53 struct pvclock_vcpu_time_info *vcpu_time;
53 struct timespec ts; 54 struct timespec ts;
54 int low, high; 55 int low, high;
56 int cpu;
55 57
56 low = (int)__pa_symbol(&wall_clock); 58 low = (int)__pa_symbol(&wall_clock);
57 high = ((u64)__pa_symbol(&wall_clock) >> 32); 59 high = ((u64)__pa_symbol(&wall_clock) >> 32);
58 60
59 native_write_msr(msr_kvm_wall_clock, low, high); 61 native_write_msr(msr_kvm_wall_clock, low, high);
60 62
61 vcpu_time = &get_cpu_var(hv_clock); 63 preempt_disable();
64 cpu = smp_processor_id();
65
66 vcpu_time = &hv_clock[cpu].pvti;
62 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); 67 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
63 put_cpu_var(hv_clock); 68
69 preempt_enable();
64 70
65 return ts.tv_sec; 71 return ts.tv_sec;
66} 72}
@@ -74,9 +80,11 @@ static cycle_t kvm_clock_read(void)
74{ 80{
75 struct pvclock_vcpu_time_info *src; 81 struct pvclock_vcpu_time_info *src;
76 cycle_t ret; 82 cycle_t ret;
83 int cpu;
77 84
78 preempt_disable_notrace(); 85 preempt_disable_notrace();
79 src = &__get_cpu_var(hv_clock); 86 cpu = smp_processor_id();
87 src = &hv_clock[cpu].pvti;
80 ret = pvclock_clocksource_read(src); 88 ret = pvclock_clocksource_read(src);
81 preempt_enable_notrace(); 89 preempt_enable_notrace();
82 return ret; 90 return ret;
@@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
99static unsigned long kvm_get_tsc_khz(void) 107static unsigned long kvm_get_tsc_khz(void)
100{ 108{
101 struct pvclock_vcpu_time_info *src; 109 struct pvclock_vcpu_time_info *src;
102 src = &per_cpu(hv_clock, 0); 110 int cpu;
103 return pvclock_tsc_khz(src); 111 unsigned long tsc_khz;
112
113 preempt_disable();
114 cpu = smp_processor_id();
115 src = &hv_clock[cpu].pvti;
116 tsc_khz = pvclock_tsc_khz(src);
117 preempt_enable();
118 return tsc_khz;
104} 119}
105 120
106static void kvm_get_preset_lpj(void) 121static void kvm_get_preset_lpj(void)
@@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void)
119{ 134{
120 bool ret = false; 135 bool ret = false;
121 struct pvclock_vcpu_time_info *src; 136 struct pvclock_vcpu_time_info *src;
137 int cpu = smp_processor_id();
122 138
123 src = &__get_cpu_var(hv_clock); 139 if (!hv_clock)
140 return ret;
141
142 src = &hv_clock[cpu].pvti;
124 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { 143 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
125 __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); 144 src->flags &= ~PVCLOCK_GUEST_STOPPED;
126 ret = true; 145 ret = true;
127 } 146 }
128 147
@@ -141,9 +160,10 @@ int kvm_register_clock(char *txt)
141{ 160{
142 int cpu = smp_processor_id(); 161 int cpu = smp_processor_id();
143 int low, high, ret; 162 int low, high, ret;
163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
144 164
145 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 165 low = (int)__pa(src) | 1;
146 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 166 high = ((u64)__pa(src) >> 32);
147 ret = native_write_msr_safe(msr_kvm_system_time, low, high); 167 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
148 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 168 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
149 cpu, high, low, txt); 169 cpu, high, low, txt);
@@ -197,6 +217,8 @@ static void kvm_shutdown(void)
197 217
198void __init kvmclock_init(void) 218void __init kvmclock_init(void)
199{ 219{
220 unsigned long mem;
221
200 if (!kvm_para_available()) 222 if (!kvm_para_available())
201 return; 223 return;
202 224
@@ -209,8 +231,18 @@ void __init kvmclock_init(void)
209 printk(KERN_INFO "kvm-clock: Using msrs %x and %x", 231 printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
210 msr_kvm_system_time, msr_kvm_wall_clock); 232 msr_kvm_system_time, msr_kvm_wall_clock);
211 233
212 if (kvm_register_clock("boot clock")) 234 mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
235 PAGE_SIZE);
236 if (!mem)
237 return;
238 hv_clock = __va(mem);
239
240 if (kvm_register_clock("boot clock")) {
241 hv_clock = NULL;
242 memblock_free(mem,
243 sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
213 return; 244 return;
245 }
214 pv_time_ops.sched_clock = kvm_clock_read; 246 pv_time_ops.sched_clock = kvm_clock_read;
215 x86_platform.calibrate_tsc = kvm_get_tsc_khz; 247 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
216 x86_platform.get_wallclock = kvm_get_wallclock; 248 x86_platform.get_wallclock = kvm_get_wallclock;
@@ -233,3 +265,37 @@ void __init kvmclock_init(void)
233 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) 265 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
234 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); 266 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
235} 267}
268
269int __init kvm_setup_vsyscall_timeinfo(void)
270{
271#ifdef CONFIG_X86_64
272 int cpu;
273 int ret;
274 u8 flags;
275 struct pvclock_vcpu_time_info *vcpu_time;
276 unsigned int size;
277
278 size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
279
280 preempt_disable();
281 cpu = smp_processor_id();
282
283 vcpu_time = &hv_clock[cpu].pvti;
284 flags = pvclock_read_flags(vcpu_time);
285
286 if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
287 preempt_enable();
288 return 1;
289 }
290
291 if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
292 preempt_enable();
293 return ret;
294 }
295
296 preempt_enable();
297
298 kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
299#endif
300 return 0;
301}
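kvmclock's per-cpu time info moves from a DEFINE_PER_CPU variable to one flat, memblock-allocated array of pvclock_vsyscall_time_info, so the same pages can later be exposed to the vsyscall path; readers now pick their slot with smp_processor_id() and bracket the access with explicit preempt_disable(), where get_cpu_var() used to do that implicitly. A sketch of the resulting access pattern (read_guest_ns() is an illustrative name):

#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/pvclock.h>

/* Flat array, one slot per possible CPU, allocated as in kvmclock_init(). */
static struct pvclock_vsyscall_time_info *hv_clock;

static u64 read_guest_ns(void)
{
	struct pvclock_vcpu_time_info *src;
	u64 ret;

	preempt_disable_notrace();
	/* smp_processor_id() is only stable while preemption is off. */
	src = &hv_clock[smp_processor_id()].pvti;
	ret = pvclock_clocksource_read(src);
	preempt_enable_notrace();

	return ret;
}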
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 7720ff5a9ee2..efdec7cd8e01 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -8,8 +8,8 @@
8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> 8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
9 * 9 *
10 * Maintainers: 10 * Maintainers:
11 * Andreas Herrmann <andreas.herrmann3@amd.com> 11 * Andreas Herrmann <herrmann.der.user@googlemail.com>
12 * Borislav Petkov <borislav.petkov@amd.com> 12 * Borislav Petkov <bp@alien8.de>
13 * 13 *
14 * This driver allows to upgrade microcode on F10h AMD 14 * This driver allows to upgrade microcode on F10h AMD
15 * CPUs and later. 15 * CPUs and later.
@@ -190,6 +190,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size,
190#define F1XH_MPB_MAX_SIZE 2048 190#define F1XH_MPB_MAX_SIZE 2048
191#define F14H_MPB_MAX_SIZE 1824 191#define F14H_MPB_MAX_SIZE 1824
192#define F15H_MPB_MAX_SIZE 4096 192#define F15H_MPB_MAX_SIZE 4096
193#define F16H_MPB_MAX_SIZE 3458
193 194
194 switch (c->x86) { 195 switch (c->x86) {
195 case 0x14: 196 case 0x14:
@@ -198,6 +199,9 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size,
198 case 0x15: 199 case 0x15:
199 max_size = F15H_MPB_MAX_SIZE; 200 max_size = F15H_MPB_MAX_SIZE;
200 break; 201 break;
202 case 0x16:
203 max_size = F16H_MPB_MAX_SIZE;
204 break;
201 default: 205 default:
202 max_size = F1XH_MPB_MAX_SIZE; 206 max_size = F1XH_MPB_MAX_SIZE;
203 break; 207 break;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index de2b7ad70273..0f5dec5c80e0 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -265,7 +265,7 @@ rootfs_initcall(pci_iommu_init);
265#ifdef CONFIG_PCI 265#ifdef CONFIG_PCI
266/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ 266/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
267 267
268static __devinit void via_no_dac(struct pci_dev *dev) 268static void via_no_dac(struct pci_dev *dev)
269{ 269{
270 if (forbid_dac == 0) { 270 if (forbid_dac == 0) {
271 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); 271 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b644e1c765dc..2ed787f15bf0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -262,36 +262,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
262 propagate_user_return_notify(prev_p, next_p); 262 propagate_user_return_notify(prev_p, next_p);
263} 263}
264 264
265int sys_fork(struct pt_regs *regs)
266{
267 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
268}
269
270/*
271 * This is trivial, and on the face of it looks like it
272 * could equally well be done in user mode.
273 *
274 * Not so, for quite unobvious reasons - register pressure.
275 * In user mode vfork() cannot have a stack frame, and if
276 * done by calling the "clone()" system call directly, you
277 * do not have enough call-clobbered registers to hold all
278 * the information you need.
279 */
280int sys_vfork(struct pt_regs *regs)
281{
282 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
283 NULL, NULL);
284}
285
286long
287sys_clone(unsigned long clone_flags, unsigned long newsp,
288 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
289{
290 if (!newsp)
291 newsp = regs->sp;
292 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
293}
294
295/* 265/*
296 * Idle related variables and functions 266 * Idle related variables and functions
297 */ 267 */
@@ -306,11 +276,6 @@ void (*pm_idle)(void);
306EXPORT_SYMBOL(pm_idle); 276EXPORT_SYMBOL(pm_idle);
307#endif 277#endif
308 278
309static inline int hlt_use_halt(void)
310{
311 return 1;
312}
313
314#ifndef CONFIG_SMP 279#ifndef CONFIG_SMP
315static inline void play_dead(void) 280static inline void play_dead(void)
316{ 281{
@@ -410,28 +375,22 @@ void cpu_idle(void)
410 */ 375 */
411void default_idle(void) 376void default_idle(void)
412{ 377{
413 if (hlt_use_halt()) { 378 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
414 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); 379 trace_cpu_idle_rcuidle(1, smp_processor_id());
415 trace_cpu_idle_rcuidle(1, smp_processor_id()); 380 current_thread_info()->status &= ~TS_POLLING;
416 current_thread_info()->status &= ~TS_POLLING; 381 /*
417 /* 382 * TS_POLLING-cleared state must be visible before we
418 * TS_POLLING-cleared state must be visible before we 383 * test NEED_RESCHED:
419 * test NEED_RESCHED: 384 */
420 */ 385 smp_mb();
421 smp_mb();
422 386
423 if (!need_resched()) 387 if (!need_resched())
424 safe_halt(); /* enables interrupts racelessly */ 388 safe_halt(); /* enables interrupts racelessly */
425 else 389 else
426 local_irq_enable();
427 current_thread_info()->status |= TS_POLLING;
428 trace_power_end_rcuidle(smp_processor_id());
429 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
430 } else {
431 local_irq_enable(); 390 local_irq_enable();
432 /* loop is done by the caller */ 391 current_thread_info()->status |= TS_POLLING;
433 cpu_relax(); 392 trace_power_end_rcuidle(smp_processor_id());
434 } 393 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
435} 394}
436#ifdef CONFIG_APM_MODULE 395#ifdef CONFIG_APM_MODULE
437EXPORT_SYMBOL(default_idle); 396EXPORT_SYMBOL(default_idle);
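
The de-indented default_idle() path above relies on a specific ordering: the polling flag must be cleared and made globally visible before need_resched is tested, otherwise a wakeup could be missed. A rough userspace model of that ordering, with illustrative flag names and a placeholder where the kernel calls safe_halt():

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool ts_polling = true;		/* stands in for TS_POLLING */
static atomic_bool need_resched_flag = false;	/* stands in for need_resched() */

static void model_default_idle(void)
{
	atomic_store(&ts_polling, false);
	/* analogue of smp_mb(): the cleared polling state must be
	 * visible before the need-resched test below */
	atomic_thread_fence(memory_order_seq_cst);

	if (!atomic_load(&need_resched_flag)) {
		/* safe_halt() in the kernel: enable interrupts and halt
		 * atomically, so a wakeup cannot slip in between */
	}

	atomic_store(&ts_polling, true);
}
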
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 44e0bff38e72..b5a8905785e6 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -128,8 +128,7 @@ void release_thread(struct task_struct *dead_task)
128} 128}
129 129
130int copy_thread(unsigned long clone_flags, unsigned long sp, 130int copy_thread(unsigned long clone_flags, unsigned long sp,
131 unsigned long arg, 131 unsigned long arg, struct task_struct *p)
132 struct task_struct *p, struct pt_regs *regs)
133{ 132{
134 struct pt_regs *childregs = task_pt_regs(p); 133 struct pt_regs *childregs = task_pt_regs(p);
135 struct task_struct *tsk; 134 struct task_struct *tsk;
@@ -138,7 +137,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
138 p->thread.sp = (unsigned long) childregs; 137 p->thread.sp = (unsigned long) childregs;
139 p->thread.sp0 = (unsigned long) (childregs+1); 138 p->thread.sp0 = (unsigned long) (childregs+1);
140 139
141 if (unlikely(!regs)) { 140 if (unlikely(p->flags & PF_KTHREAD)) {
142 /* kernel thread */ 141 /* kernel thread */
143 memset(childregs, 0, sizeof(struct pt_regs)); 142 memset(childregs, 0, sizeof(struct pt_regs));
144 p->thread.ip = (unsigned long) ret_from_kernel_thread; 143 p->thread.ip = (unsigned long) ret_from_kernel_thread;
@@ -156,12 +155,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
156 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); 155 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
157 return 0; 156 return 0;
158 } 157 }
159 *childregs = *regs; 158 *childregs = *current_pt_regs();
160 childregs->ax = 0; 159 childregs->ax = 0;
161 childregs->sp = sp; 160 if (sp)
161 childregs->sp = sp;
162 162
163 p->thread.ip = (unsigned long) ret_from_fork; 163 p->thread.ip = (unsigned long) ret_from_fork;
164 task_user_gs(p) = get_user_gs(regs); 164 task_user_gs(p) = get_user_gs(current_pt_regs());
165 165
166 p->fpu_counter = 0; 166 p->fpu_counter = 0;
167 p->thread.io_bitmap_ptr = NULL; 167 p->thread.io_bitmap_ptr = NULL;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 16c6365e2b86..6e68a6194965 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -146,8 +146,7 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
146} 146}
147 147
148int copy_thread(unsigned long clone_flags, unsigned long sp, 148int copy_thread(unsigned long clone_flags, unsigned long sp,
149 unsigned long arg, 149 unsigned long arg, struct task_struct *p)
150 struct task_struct *p, struct pt_regs *regs)
151{ 150{
152 int err; 151 int err;
153 struct pt_regs *childregs; 152 struct pt_regs *childregs;
@@ -169,7 +168,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
169 savesegment(ds, p->thread.ds); 168 savesegment(ds, p->thread.ds);
170 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); 169 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
171 170
172 if (unlikely(!regs)) { 171 if (unlikely(p->flags & PF_KTHREAD)) {
173 /* kernel thread */ 172 /* kernel thread */
174 memset(childregs, 0, sizeof(struct pt_regs)); 173 memset(childregs, 0, sizeof(struct pt_regs));
175 childregs->sp = (unsigned long)childregs; 174 childregs->sp = (unsigned long)childregs;
@@ -181,10 +180,11 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
181 childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; 180 childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
182 return 0; 181 return 0;
183 } 182 }
184 *childregs = *regs; 183 *childregs = *current_pt_regs();
185 184
186 childregs->ax = 0; 185 childregs->ax = 0;
187 childregs->sp = sp; 186 if (sp)
187 childregs->sp = sp;
188 188
189 err = -ENOMEM; 189 err = -ENOMEM;
190 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); 190 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
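
Both copy_thread() variants above now key the kernel-thread case off PF_KTHREAD and copy current_pt_regs() instead of taking a regs argument, with sp == 0 meaning "keep the parent's stack pointer". A small userspace model of that decision; the struct and helper here are illustrative, not kernel types:

#include <stdbool.h>
#include <string.h>

struct regs_model {
	unsigned long ip, sp, ax;
};

static void model_copy_thread(bool is_kthread,
			      const struct regs_model *parent_regs,
			      unsigned long new_sp,
			      struct regs_model *childregs)
{
	if (is_kthread) {		/* was: if (unlikely(!regs)) */
		memset(childregs, 0, sizeof(*childregs));
		return;			/* kernel thread: no user frame */
	}
	*childregs = *parent_regs;	/* was: *childregs = *regs */
	childregs->ax = 0;		/* child sees 0 from fork() */
	if (new_sp)			/* sp == 0 now means "inherit" */
		childregs->sp = new_sp;
}
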
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index b00b33a18390..b629bbe0d9bd 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
22#include <linux/perf_event.h> 22#include <linux/perf_event.h>
23#include <linux/hw_breakpoint.h> 23#include <linux/hw_breakpoint.h>
24#include <linux/rcupdate.h> 24#include <linux/rcupdate.h>
25#include <linux/module.h>
26#include <linux/context_tracking.h>
25 27
26#include <asm/uaccess.h> 28#include <asm/uaccess.h>
27#include <asm/pgtable.h> 29#include <asm/pgtable.h>
@@ -166,6 +168,35 @@ static inline bool invalid_selector(u16 value)
166 168
167#define FLAG_MASK FLAG_MASK_32 169#define FLAG_MASK FLAG_MASK_32
168 170
171/*
172 * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
173 * when it traps. The previous stack will be directly underneath the saved
174 * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
175 *
176 * Now, if the stack is empty, '&regs->sp' is out of range. In this
177 * case we try to take the previous stack. To always return a non-null
178 * stack pointer we fall back to regs as stack if no previous stack
179 * exists.
180 *
181 * This is valid only for kernel mode traps.
182 */
183unsigned long kernel_stack_pointer(struct pt_regs *regs)
184{
185 unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
186 unsigned long sp = (unsigned long)&regs->sp;
187 struct thread_info *tinfo;
188
189 if (context == (sp & ~(THREAD_SIZE - 1)))
190 return sp;
191
192 tinfo = (struct thread_info *)context;
193 if (tinfo->previous_esp)
194 return tinfo->previous_esp;
195
196 return (unsigned long)regs;
197}
198EXPORT_SYMBOL_GPL(kernel_stack_pointer);
199
169static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) 200static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
170{ 201{
171 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); 202 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
@@ -1461,7 +1492,7 @@ long syscall_trace_enter(struct pt_regs *regs)
1461{ 1492{
1462 long ret = 0; 1493 long ret = 0;
1463 1494
1464 rcu_user_exit(); 1495 user_exit();
1465 1496
1466 /* 1497 /*
1467 * If we stepped into a sysenter/syscall insn, it trapped in 1498 * If we stepped into a sysenter/syscall insn, it trapped in
@@ -1511,6 +1542,13 @@ void syscall_trace_leave(struct pt_regs *regs)
1511{ 1542{
1512 bool step; 1543 bool step;
1513 1544
1545 /*
1546 * We may come here right after calling schedule_user()
1547 * or do_notify_resume(), in which case we can be in RCU
1548 * user mode.
1549 */
1550 user_exit();
1551
1514 audit_syscall_exit(regs); 1552 audit_syscall_exit(regs);
1515 1553
1516 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1554 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
@@ -1527,5 +1565,5 @@ void syscall_trace_leave(struct pt_regs *regs)
1527 if (step || test_thread_flag(TIF_SYSCALL_TRACE)) 1565 if (step || test_thread_flag(TIF_SYSCALL_TRACE))
1528 tracehook_report_syscall_exit(regs, step); 1566 tracehook_report_syscall_exit(regs, step);
1529 1567
1530 rcu_user_enter(); 1568 user_enter();
1531} 1569}
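
The exported kernel_stack_pointer() above decides whether '&regs->sp' is usable by checking that it lies in the same THREAD_SIZE-aligned region as regs itself. A userspace model of just that check, with an assumed THREAD_SIZE value:

#include <stdint.h>

#define THREAD_SIZE_MODEL 8192UL	/* illustrative; the real value is per-config */

/* Two addresses live on the same kernel stack iff they share the same
 * THREAD_SIZE-aligned base. */
static int same_stack(uintptr_t regs_addr, uintptr_t candidate_sp)
{
	return (regs_addr & ~(THREAD_SIZE_MODEL - 1)) ==
	       (candidate_sp & ~(THREAD_SIZE_MODEL - 1));
}
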
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 42eb3300dfc6..85c39590c1a4 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -17,23 +17,13 @@
17 17
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/notifier.h>
21#include <linux/sched.h>
22#include <linux/gfp.h>
23#include <linux/bootmem.h>
24#include <asm/fixmap.h>
20#include <asm/pvclock.h> 25#include <asm/pvclock.h>
21 26
22/*
23 * These are perodically updated
24 * xen: magic shared_info page
25 * kvm: gpa registered via msr
26 * and then copied here.
27 */
28struct pvclock_shadow_time {
29 u64 tsc_timestamp; /* TSC at last update of time vals. */
30 u64 system_timestamp; /* Time, in nanosecs, since boot. */
31 u32 tsc_to_nsec_mul;
32 int tsc_shift;
33 u32 version;
34 u8 flags;
35};
36
37static u8 valid_flags __read_mostly = 0; 27static u8 valid_flags __read_mostly = 0;
38 28
39void pvclock_set_flags(u8 flags) 29void pvclock_set_flags(u8 flags)
@@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags)
41 valid_flags = flags; 31 valid_flags = flags;
42} 32}
43 33
44static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
45{
46 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
47 return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
48 shadow->tsc_shift);
49}
50
51/*
52 * Reads a consistent set of time-base values from hypervisor,
53 * into a shadow data area.
54 */
55static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
56 struct pvclock_vcpu_time_info *src)
57{
58 do {
59 dst->version = src->version;
60 rmb(); /* fetch version before data */
61 dst->tsc_timestamp = src->tsc_timestamp;
62 dst->system_timestamp = src->system_time;
63 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
64 dst->tsc_shift = src->tsc_shift;
65 dst->flags = src->flags;
66 rmb(); /* test version after fetching data */
67 } while ((src->version & 1) || (dst->version != src->version));
68
69 return dst->version;
70}
71
72unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) 34unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
73{ 35{
74 u64 pv_tsc_khz = 1000000ULL << 32; 36 u64 pv_tsc_khz = 1000000ULL << 32;
@@ -88,23 +50,32 @@ void pvclock_resume(void)
88 atomic64_set(&last_value, 0); 50 atomic64_set(&last_value, 0);
89} 51}
90 52
53u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
54{
55 unsigned version;
56 cycle_t ret;
57 u8 flags;
58
59 do {
60 version = __pvclock_read_cycles(src, &ret, &flags);
61 } while ((src->version & 1) || version != src->version);
62
63 return flags & valid_flags;
64}
65
91cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 66cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
92{ 67{
93 struct pvclock_shadow_time shadow;
94 unsigned version; 68 unsigned version;
95 cycle_t ret, offset; 69 cycle_t ret;
96 u64 last; 70 u64 last;
71 u8 flags;
97 72
98 do { 73 do {
99 version = pvclock_get_time_values(&shadow, src); 74 version = __pvclock_read_cycles(src, &ret, &flags);
100 barrier(); 75 } while ((src->version & 1) || version != src->version);
101 offset = pvclock_get_nsec_offset(&shadow);
102 ret = shadow.system_timestamp + offset;
103 barrier();
104 } while (version != src->version);
105 76
106 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && 77 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
107 (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) 78 (flags & PVCLOCK_TSC_STABLE_BIT))
108 return ret; 79 return ret;
109 80
110 /* 81 /*
@@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
156 127
157 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
158} 129}
130
131static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
132
133static struct pvclock_vsyscall_time_info *
134pvclock_get_vsyscall_user_time_info(int cpu)
135{
136 if (!pvclock_vdso_info) {
137 BUG();
138 return NULL;
139 }
140
141 return &pvclock_vdso_info[cpu];
142}
143
144struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
145{
146 return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
147}
148
149#ifdef CONFIG_X86_64
150static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
151 void *v)
152{
153 struct task_migration_notifier *mn = v;
154 struct pvclock_vsyscall_time_info *pvti;
155
156 pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
157
158 /* this is NULL when pvclock vsyscall is not initialized */
159 if (unlikely(pvti == NULL))
160 return NOTIFY_DONE;
161
162 pvti->migrate_count++;
163
164 return NOTIFY_DONE;
165}
166
167static struct notifier_block pvclock_migrate = {
168 .notifier_call = pvclock_task_migrate,
169};
170
171/*
172 * Initialize the generic pvclock vsyscall state. This allocates one
173 * or more pages for the per-vcpu pvclock information and sets up a
174 * fixmap mapping for those pages.
175 */
176
177int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
178 int size)
179{
180 int idx;
181
182 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
183
184 pvclock_vdso_info = i;
185
186 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
187 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
188 __pa_symbol(i) + (idx*PAGE_SIZE),
189 PAGE_KERNEL_VVAR);
190 }
191
192
193 register_task_migration_notifier(&pvclock_migrate);
194
195 return 0;
196}
197#endif
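
With the shadow-copy helpers gone, pvclock_clocksource_read() and the new pvclock_read_flags() both retry until the publisher's version field is even and unchanged, a seqcount-style protocol. A userspace model of that retry loop; the structure layout and fences here are illustrative, not the pvclock ABI:

#include <stdint.h>

struct time_info_model {
	volatile uint32_t version;
	volatile uint64_t system_time;
};

static uint64_t read_consistent_time(const struct time_info_model *src)
{
	uint32_t version;
	uint64_t t;

	do {
		version = src->version;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);	/* rmb() analogue */
		t = src->system_time;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
	} while ((src->version & 1) || version != src->version);

	return t;
}
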
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 1b27de563561..26ee48a33dc4 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -8,7 +8,7 @@
8 8
9#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) 9#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
10 10
11static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) 11static void quirk_intel_irqbalance(struct pci_dev *dev)
12{ 12{
13 u8 config; 13 u8 config;
14 u16 word; 14 u16 word;
@@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
512 512
513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
514/* Set correct numa_node information for AMD NB functions */ 514/* Set correct numa_node information for AMD NB functions */
515static void __devinit quirk_amd_nb_node(struct pci_dev *dev) 515static void quirk_amd_nb_node(struct pci_dev *dev)
516{ 516{
517 struct pci_dev *nb_ht; 517 struct pci_dev *nb_ht;
518 unsigned int devfn; 518 unsigned int devfn;
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 4929c1be0ac0..801602b5d745 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -195,12 +195,6 @@ void read_persistent_clock(struct timespec *ts)
195 ts->tv_nsec = 0; 195 ts->tv_nsec = 0;
196} 196}
197 197
198unsigned long long native_read_tsc(void)
199{
200 return __native_read_tsc();
201}
202EXPORT_SYMBOL(native_read_tsc);
203
204 198
205static struct resource rtc_resources[] = { 199static struct resource rtc_resources[] = {
206 [0] = { 200 [0] = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2702c5d4acd2..8354399b3aae 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -143,11 +143,7 @@ int default_check_phys_apicid_present(int phys_apicid)
143} 143}
144#endif 144#endif
145 145
146#ifndef CONFIG_DEBUG_BOOT_PARAMS
147struct boot_params __initdata boot_params;
148#else
149struct boot_params boot_params; 146struct boot_params boot_params;
150#endif
151 147
152/* 148/*
153 * Machine setup.. 149 * Machine setup..
@@ -614,6 +610,83 @@ static __init void reserve_ibft_region(void)
614 610
615static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; 611static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
616 612
613static bool __init snb_gfx_workaround_needed(void)
614{
615#ifdef CONFIG_PCI
616 int i;
617 u16 vendor, devid;
618 static const __initconst u16 snb_ids[] = {
619 0x0102,
620 0x0112,
621 0x0122,
622 0x0106,
623 0x0116,
624 0x0126,
625 0x010a,
626 };
627
628 /* Assume no if something weird is going on with PCI */
629 if (!early_pci_allowed())
630 return false;
631
632 vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
633 if (vendor != 0x8086)
634 return false;
635
636 devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
637 for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
638 if (devid == snb_ids[i])
639 return true;
640#endif
641
642 return false;
643}
644
645/*
646 * Sandy Bridge graphics has trouble with certain ranges; exclude
647 * them from allocation.
648 */
649static void __init trim_snb_memory(void)
650{
651 static const __initconst unsigned long bad_pages[] = {
652 0x20050000,
653 0x20110000,
654 0x20130000,
655 0x20138000,
656 0x40004000,
657 };
658 int i;
659
660 if (!snb_gfx_workaround_needed())
661 return;
662
663 printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
664
665 /*
666 * Reserve all memory below the 1 MB mark that has not
667 * already been reserved.
668 */
669 memblock_reserve(0, 1<<20);
670
671 for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
672 if (memblock_reserve(bad_pages[i], PAGE_SIZE))
673 printk(KERN_WARNING "failed to reserve 0x%08lx\n",
674 bad_pages[i]);
675 }
676}
677
678/*
679 * Here we put platform-specific memory range workarounds, i.e.
680 * memory known to be corrupt or that otherwise needs to be reserved
681 * on specific platforms.
682 *
683 * If this gets used more widely it could use a real dispatch mechanism.
684 */
685static void __init trim_platform_memory_ranges(void)
686{
687 trim_snb_memory();
688}
689
617static void __init trim_bios_range(void) 690static void __init trim_bios_range(void)
618{ 691{
619 /* 692 /*
@@ -634,6 +707,7 @@ static void __init trim_bios_range(void)
634 * take them out. 707 * take them out.
635 */ 708 */
636 e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); 709 e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
710
637 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 711 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
638} 712}
639 713
@@ -912,6 +986,8 @@ void __init setup_arch(char **cmdline_p)
912 986
913 setup_real_mode(); 987 setup_real_mode();
914 988
989 trim_platform_memory_ranges();
990
915 init_gbpages(); 991 init_gbpages();
916 992
917 /* max_pfn_mapped is updated here */ 993 /* max_pfn_mapped is updated here */
@@ -956,6 +1032,10 @@ void __init setup_arch(char **cmdline_p)
956 1032
957 reserve_initrd(); 1033 reserve_initrd();
958 1034
1035#if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD)
1036 acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start);
1037#endif
1038
959 reserve_crashkernel(); 1039 reserve_crashkernel();
960 1040
961 vsmp_init(); 1041 vsmp_init();
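
snb_gfx_workaround_needed() above gates the workaround on an Intel vendor ID and a small list of Sandy Bridge graphics device IDs. A userspace sketch of that match, with the ID table copied from the hunk and the early-PCI config read left out:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static const uint16_t snb_ids[] = {
	0x0102, 0x0112, 0x0122, 0x0106, 0x0116, 0x0126, 0x010a,
};

static bool is_snb_gfx(uint16_t vendor, uint16_t devid)
{
	size_t i;

	if (vendor != 0x8086)			/* Intel only */
		return false;
	for (i = 0; i < sizeof(snb_ids) / sizeof(snb_ids[0]); i++)
		if (devid == snb_ids[i])
			return true;
	return false;
}
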
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 70b27ee6118e..d6bf1f34a6e9 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -22,6 +22,7 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/user-return-notifier.h> 23#include <linux/user-return-notifier.h>
24#include <linux/uprobes.h> 24#include <linux/uprobes.h>
25#include <linux/context_tracking.h>
25 26
26#include <asm/processor.h> 27#include <asm/processor.h>
27#include <asm/ucontext.h> 28#include <asm/ucontext.h>
@@ -363,10 +364,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
363 else 364 else
364 put_user_ex(0, &frame->uc.uc_flags); 365 put_user_ex(0, &frame->uc.uc_flags);
365 put_user_ex(0, &frame->uc.uc_link); 366 put_user_ex(0, &frame->uc.uc_link);
366 put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 367 err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
367 put_user_ex(sas_ss_flags(regs->sp),
368 &frame->uc.uc_stack.ss_flags);
369 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
370 368
371 /* Set up to return from userspace. */ 369 /* Set up to return from userspace. */
372 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 370 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -413,7 +411,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
413 struct rt_sigframe __user *frame; 411 struct rt_sigframe __user *frame;
414 void __user *fp = NULL; 412 void __user *fp = NULL;
415 int err = 0; 413 int err = 0;
416 struct task_struct *me = current;
417 414
418 frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp); 415 frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
419 416
@@ -432,10 +429,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
432 else 429 else
433 put_user_ex(0, &frame->uc.uc_flags); 430 put_user_ex(0, &frame->uc.uc_flags);
434 put_user_ex(0, &frame->uc.uc_link); 431 put_user_ex(0, &frame->uc.uc_link);
435 put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 432 err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
436 put_user_ex(sas_ss_flags(regs->sp),
437 &frame->uc.uc_stack.ss_flags);
438 put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
439 433
440 /* Set up to return from userspace. If provided, use a stub 434 /* Set up to return from userspace. If provided, use a stub
441 already in userspace. */ 435 already in userspace. */
@@ -502,10 +496,7 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
502 else 496 else
503 put_user_ex(0, &frame->uc.uc_flags); 497 put_user_ex(0, &frame->uc.uc_flags);
504 put_user_ex(0, &frame->uc.uc_link); 498 put_user_ex(0, &frame->uc.uc_link);
505 put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 499 err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp);
506 put_user_ex(sas_ss_flags(regs->sp),
507 &frame->uc.uc_stack.ss_flags);
508 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
509 put_user_ex(0, &frame->uc.uc__pad0); 500 put_user_ex(0, &frame->uc.uc__pad0);
510 501
511 if (ka->sa.sa_flags & SA_RESTORER) { 502 if (ka->sa.sa_flags & SA_RESTORER) {
@@ -602,13 +593,6 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
602} 593}
603#endif /* CONFIG_X86_32 */ 594#endif /* CONFIG_X86_32 */
604 595
605long
606sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
607 struct pt_regs *regs)
608{
609 return do_sigaltstack(uss, uoss, regs->sp);
610}
611
612/* 596/*
613 * Do a signal return; undo the signal stack. 597 * Do a signal return; undo the signal stack.
614 */ 598 */
@@ -658,7 +642,7 @@ long sys_rt_sigreturn(struct pt_regs *regs)
658 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 642 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
659 goto badframe; 643 goto badframe;
660 644
661 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) 645 if (restore_altstack(&frame->uc.uc_stack))
662 goto badframe; 646 goto badframe;
663 647
664 return ax; 648 return ax;
@@ -816,7 +800,7 @@ static void do_signal(struct pt_regs *regs)
816void 800void
817do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 801do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
818{ 802{
819 rcu_user_exit(); 803 user_exit();
820 804
821#ifdef CONFIG_X86_MCE 805#ifdef CONFIG_X86_MCE
822 /* notify userspace of pending MCEs */ 806 /* notify userspace of pending MCEs */
@@ -838,7 +822,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
838 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) 822 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
839 fire_user_return_notifiers(); 823 fire_user_return_notifiers();
840 824
841 rcu_user_enter(); 825 user_enter();
842} 826}
843 827
844void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 828void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
@@ -864,7 +848,6 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
864 struct rt_sigframe_x32 __user *frame; 848 struct rt_sigframe_x32 __user *frame;
865 sigset_t set; 849 sigset_t set;
866 unsigned long ax; 850 unsigned long ax;
867 struct pt_regs tregs;
868 851
869 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); 852 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
870 853
@@ -878,8 +861,7 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
878 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 861 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
879 goto badframe; 862 goto badframe;
880 863
881 tregs = *regs; 864 if (compat_restore_altstack(&frame->uc.uc_stack))
882 if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
883 goto badframe; 865 goto badframe;
884 866
885 return ax; 867 return ax;
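
The signal-frame hunks above replace three hand-written put_user_ex() stores with a single __save_altstack()/__compat_save_altstack() call. A rough userspace approximation of what gets stored, with an illustrative destination struct and a simplified stand-in for sas_ss_flags():

#include <signal.h>
#include <stddef.h>
#include <stdint.h>

struct uc_stack_model {
	void	*ss_sp;
	int	 ss_flags;
	size_t	 ss_size;
};

static void save_altstack_model(struct uc_stack_model *dst,
				const stack_t *ss, uintptr_t sp)
{
	int on_stack = ss->ss_size &&
		       sp >= (uintptr_t)ss->ss_sp &&
		       sp <  (uintptr_t)ss->ss_sp + ss->ss_size;

	dst->ss_sp    = ss->ss_sp;
	dst->ss_flags = ss->ss_size == 0 ? SS_DISABLE :
			(on_stack ? SS_ONSTACK : 0);
	dst->ss_size  = ss->ss_size;
}
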
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c80a33bc528b..ed0fe385289d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -68,6 +68,8 @@
68#include <asm/mwait.h> 68#include <asm/mwait.h>
69#include <asm/apic.h> 69#include <asm/apic.h>
70#include <asm/io_apic.h> 70#include <asm/io_apic.h>
71#include <asm/i387.h>
72#include <asm/fpu-internal.h>
71#include <asm/setup.h> 73#include <asm/setup.h>
72#include <asm/uv/uv.h> 74#include <asm/uv/uv.h>
73#include <linux/mc146818rtc.h> 75#include <linux/mc146818rtc.h>
@@ -125,8 +127,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
125atomic_t init_deasserted; 127atomic_t init_deasserted;
126 128
127/* 129/*
128 * Report back to the Boot Processor. 130 * Report back to the Boot Processor during boot time or to the caller processor
129 * Running on AP. 131 * during CPU online.
130 */ 132 */
131static void __cpuinit smp_callin(void) 133static void __cpuinit smp_callin(void)
132{ 134{
@@ -138,15 +140,17 @@ static void __cpuinit smp_callin(void)
138 * we may get here before an INIT-deassert IPI reaches 140 * we may get here before an INIT-deassert IPI reaches
139 * our local APIC. We have to wait for the IPI or we'll 141 * our local APIC. We have to wait for the IPI or we'll
140 * lock up on an APIC access. 142 * lock up on an APIC access.
143 *
144 * Since CPU0 is not woken up by INIT, it doesn't wait for the IPI.
141 */ 145 */
142 if (apic->wait_for_init_deassert) 146 cpuid = smp_processor_id();
147 if (apic->wait_for_init_deassert && cpuid != 0)
143 apic->wait_for_init_deassert(&init_deasserted); 148 apic->wait_for_init_deassert(&init_deasserted);
144 149
145 /* 150 /*
146 * (This works even if the APIC is not enabled.) 151 * (This works even if the APIC is not enabled.)
147 */ 152 */
148 phys_id = read_apic_id(); 153 phys_id = read_apic_id();
149 cpuid = smp_processor_id();
150 if (cpumask_test_cpu(cpuid, cpu_callin_mask)) { 154 if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
151 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 155 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
152 phys_id, cpuid); 156 phys_id, cpuid);
@@ -228,6 +232,8 @@ static void __cpuinit smp_callin(void)
228 cpumask_set_cpu(cpuid, cpu_callin_mask); 232 cpumask_set_cpu(cpuid, cpu_callin_mask);
229} 233}
230 234
235static int cpu0_logical_apicid;
236static int enable_start_cpu0;
231/* 237/*
232 * Activate a secondary processor. 238 * Activate a secondary processor.
233 */ 239 */
@@ -243,6 +249,8 @@ notrace static void __cpuinit start_secondary(void *unused)
243 preempt_disable(); 249 preempt_disable();
244 smp_callin(); 250 smp_callin();
245 251
252 enable_start_cpu0 = 0;
253
246#ifdef CONFIG_X86_32 254#ifdef CONFIG_X86_32
247 /* switch away from the initial page table */ 255 /* switch away from the initial page table */
248 load_cr3(swapper_pg_dir); 256 load_cr3(swapper_pg_dir);
@@ -279,19 +287,30 @@ notrace static void __cpuinit start_secondary(void *unused)
279 cpu_idle(); 287 cpu_idle();
280} 288}
281 289
290void __init smp_store_boot_cpu_info(void)
291{
292 int id = 0; /* CPU 0 */
293 struct cpuinfo_x86 *c = &cpu_data(id);
294
295 *c = boot_cpu_data;
296 c->cpu_index = id;
297}
298
282/* 299/*
283 * The bootstrap kernel entry code has set these up. Save them for 300 * The bootstrap kernel entry code has set these up. Save them for
284 * a given CPU 301 * a given CPU
285 */ 302 */
286
287void __cpuinit smp_store_cpu_info(int id) 303void __cpuinit smp_store_cpu_info(int id)
288{ 304{
289 struct cpuinfo_x86 *c = &cpu_data(id); 305 struct cpuinfo_x86 *c = &cpu_data(id);
290 306
291 *c = boot_cpu_data; 307 *c = boot_cpu_data;
292 c->cpu_index = id; 308 c->cpu_index = id;
293 if (id != 0) 309 /*
294 identify_secondary_cpu(c); 310 * During boot time, CPU0 has this setup already. Save the info when
311 * bringing up AP or offlined CPU0.
312 */
313 identify_secondary_cpu(c);
295} 314}
296 315
297static bool __cpuinit 316static bool __cpuinit
@@ -313,7 +332,7 @@ do { \
313 332
314static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) 333static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
315{ 334{
316 if (cpu_has(c, X86_FEATURE_TOPOEXT)) { 335 if (cpu_has_topoext) {
317 int cpu1 = c->cpu_index, cpu2 = o->cpu_index; 336 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
318 337
319 if (c->phys_proc_id == o->phys_proc_id && 338 if (c->phys_proc_id == o->phys_proc_id &&
@@ -481,7 +500,7 @@ void __inquire_remote_apic(int apicid)
481 * won't ... remember to clear down the APIC, etc later. 500 * won't ... remember to clear down the APIC, etc later.
482 */ 501 */
483int __cpuinit 502int __cpuinit
484wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) 503wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
485{ 504{
486 unsigned long send_status, accept_status = 0; 505 unsigned long send_status, accept_status = 0;
487 int maxlvt; 506 int maxlvt;
@@ -489,7 +508,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
489 /* Target chip */ 508 /* Target chip */
490 /* Boot on the stack */ 509 /* Boot on the stack */
491 /* Kick the second */ 510 /* Kick the second */
492 apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid); 511 apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid);
493 512
494 pr_debug("Waiting for send to finish...\n"); 513 pr_debug("Waiting for send to finish...\n");
495 send_status = safe_apic_wait_icr_idle(); 514 send_status = safe_apic_wait_icr_idle();
@@ -649,6 +668,63 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
649 node, cpu, apicid); 668 node, cpu, apicid);
650} 669}
651 670
671static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs)
672{
673 int cpu;
674
675 cpu = smp_processor_id();
676 if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0)
677 return NMI_HANDLED;
678
679 return NMI_DONE;
680}
681
682/*
683 * Wake up AP by INIT, INIT, STARTUP sequence.
684 *
685 * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS
686 * boot-strap code, which is not a desired behavior for waking up the BSP. To
687 * avoid the boot-strap code, wake up CPU0 by NMI instead.
688 *
689 * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined
690 * (i.e. physically hot removed and then hot added), NMI won't wake it up.
691 * We'll change this code in the future to wake up hard offlined CPU0 if
692 * real platform and request are available.
693 */
694static int __cpuinit
695wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
696 int *cpu0_nmi_registered)
697{
698 int id;
699 int boot_error;
700
701 /*
702 * Wake up AP by INIT, INIT, STARTUP sequence.
703 */
704 if (cpu)
705 return wakeup_secondary_cpu_via_init(apicid, start_ip);
706
707 /*
708 * Wake up BSP by NMI.
709 *
710 * Register an NMI handler to help wake up CPU0.
711 */
712 boot_error = register_nmi_handler(NMI_LOCAL,
713 wakeup_cpu0_nmi, 0, "wake_cpu0");
714
715 if (!boot_error) {
716 enable_start_cpu0 = 1;
717 *cpu0_nmi_registered = 1;
718 if (apic->dest_logical == APIC_DEST_LOGICAL)
719 id = cpu0_logical_apicid;
720 else
721 id = apicid;
722 boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
723 }
724
725 return boot_error;
726}
727
652/* 728/*
653 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 729 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
654 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 730 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -664,6 +740,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
664 740
665 unsigned long boot_error = 0; 741 unsigned long boot_error = 0;
666 int timeout; 742 int timeout;
743 int cpu0_nmi_registered = 0;
667 744
668 /* Just in case we booted with a single CPU. */ 745 /* Just in case we booted with a single CPU. */
669 alternatives_enable_smp(); 746 alternatives_enable_smp();
@@ -711,13 +788,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
711 } 788 }
712 789
713 /* 790 /*
714 * Kick the secondary CPU. Use the method in the APIC driver 791 * Wake up a CPU in different cases:
715 * if it's defined - or use an INIT boot APIC message otherwise: 792 * - Use the method in the APIC driver if it's defined
793 * Otherwise,
794 * - Use an INIT boot APIC message for APs or NMI for BSP.
716 */ 795 */
717 if (apic->wakeup_secondary_cpu) 796 if (apic->wakeup_secondary_cpu)
718 boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); 797 boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
719 else 798 else
720 boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); 799 boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
800 &cpu0_nmi_registered);
721 801
722 if (!boot_error) { 802 if (!boot_error) {
723 /* 803 /*
@@ -782,6 +862,13 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
782 */ 862 */
783 smpboot_restore_warm_reset_vector(); 863 smpboot_restore_warm_reset_vector();
784 } 864 }
865 /*
866 * Clean up the NMI handler. Do this after the callin and callout sync
867 * to avoid the impact of a possibly long unregister time.
868 */
869 if (cpu0_nmi_registered)
870 unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
871
785 return boot_error; 872 return boot_error;
786} 873}
787 874
@@ -795,7 +882,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
795 882
796 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); 883 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
797 884
798 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 885 if (apicid == BAD_APICID ||
799 !physid_isset(apicid, phys_cpu_present_map) || 886 !physid_isset(apicid, phys_cpu_present_map) ||
800 !apic->apic_id_valid(apicid)) { 887 !apic->apic_id_valid(apicid)) {
801 pr_err("%s: bad cpu %d\n", __func__, cpu); 888 pr_err("%s: bad cpu %d\n", __func__, cpu);
@@ -818,6 +905,9 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
818 905
819 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 906 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
820 907
908 /* the FPU context is blank, nobody can own it */
909 __cpu_disable_lazy_restore(cpu);
910
821 err = do_boot_cpu(apicid, cpu, tidle); 911 err = do_boot_cpu(apicid, cpu, tidle);
822 if (err) { 912 if (err) {
823 pr_debug("do_boot_cpu failed %d\n", err); 913 pr_debug("do_boot_cpu failed %d\n", err);
@@ -990,7 +1080,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
990 /* 1080 /*
991 * Setup boot CPU information 1081 * Setup boot CPU information
992 */ 1082 */
993 smp_store_cpu_info(0); /* Final full version of the data */ 1083 smp_store_boot_cpu_info(); /* Final full version of the data */
994 cpumask_copy(cpu_callin_mask, cpumask_of(0)); 1084 cpumask_copy(cpu_callin_mask, cpumask_of(0));
995 mb(); 1085 mb();
996 1086
@@ -1026,6 +1116,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1026 */ 1116 */
1027 setup_local_APIC(); 1117 setup_local_APIC();
1028 1118
1119 if (x2apic_mode)
1120 cpu0_logical_apicid = apic_read(APIC_LDR);
1121 else
1122 cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
1123
1029 /* 1124 /*
1030 * Enable IO APIC before setting up error vector 1125 * Enable IO APIC before setting up error vector
1031 */ 1126 */
@@ -1214,19 +1309,6 @@ void cpu_disable_common(void)
1214 1309
1215int native_cpu_disable(void) 1310int native_cpu_disable(void)
1216{ 1311{
1217 int cpu = smp_processor_id();
1218
1219 /*
1220 * Perhaps use cpufreq to drop frequency, but that could go
1221 * into generic code.
1222 *
1223 * We won't take down the boot processor on i386 due to some
1224 * interrupts only being able to be serviced by the BSP.
1225 * Especially so if we're not using an IOAPIC -zwane
1226 */
1227 if (cpu == 0)
1228 return -EBUSY;
1229
1230 clear_local_APIC(); 1312 clear_local_APIC();
1231 1313
1232 cpu_disable_common(); 1314 cpu_disable_common();
@@ -1266,6 +1348,14 @@ void play_dead_common(void)
1266 local_irq_disable(); 1348 local_irq_disable();
1267} 1349}
1268 1350
1351static bool wakeup_cpu0(void)
1352{
1353 if (smp_processor_id() == 0 && enable_start_cpu0)
1354 return true;
1355
1356 return false;
1357}
1358
1269/* 1359/*
1270 * We need to flush the caches before going to sleep, lest we have 1360 * We need to flush the caches before going to sleep, lest we have
1271 * dirty data in our caches when we come back up. 1361 * dirty data in our caches when we come back up.
@@ -1329,6 +1419,11 @@ static inline void mwait_play_dead(void)
1329 __monitor(mwait_ptr, 0, 0); 1419 __monitor(mwait_ptr, 0, 0);
1330 mb(); 1420 mb();
1331 __mwait(eax, 0); 1421 __mwait(eax, 0);
1422 /*
1423 * If NMI wants to wake up CPU0, start CPU0.
1424 */
1425 if (wakeup_cpu0())
1426 start_cpu0();
1332 } 1427 }
1333} 1428}
1334 1429
@@ -1339,6 +1434,11 @@ static inline void hlt_play_dead(void)
1339 1434
1340 while (1) { 1435 while (1) {
1341 native_halt(); 1436 native_halt();
1437 /*
1438 * If NMI wants to wake up CPU0, start CPU0.
1439 */
1440 if (wakeup_cpu0())
1441 start_cpu0();
1342 } 1442 }
1343} 1443}
1344 1444
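
The smpboot changes above split the wakeup path: APs keep the INIT/INIT/STARTUP sequence, while a soft-offlined CPU0 is woken by NMI, and the play-dead loops check whether such an NMI has armed a restart. A trivial userspace model of those two decisions, with illustrative names:

#include <stdbool.h>

enum wake_method { WAKE_INIT_STARTUP, WAKE_NMI };

static enum wake_method pick_wake_method(int cpu)
{
	/* mirrors wakeup_cpu_via_init_nmi(): only CPU0 takes the NMI path */
	return cpu ? WAKE_INIT_STARTUP : WAKE_NMI;
}

static bool should_restart_cpu0(int cpu, bool enable_start_cpu0)
{
	/* mirrors wakeup_cpu0(): only CPU0, and only if a wakeup is armed */
	return cpu == 0 && enable_start_cpu0;
}
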
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index cd3b2438a980..9b4d51d0c0d0 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -165,10 +165,11 @@ void set_task_blockstep(struct task_struct *task, bool on)
165 * Ensure irq/preemption can't change debugctl in between. 165 * Ensure irq/preemption can't change debugctl in between.
166 * Note also that both TIF_BLOCKSTEP and debugctl should 166 * Note also that both TIF_BLOCKSTEP and debugctl should
167 * be changed atomically wrt preemption. 167 * be changed atomically wrt preemption.
168 * FIXME: this means that set/clear TIF_BLOCKSTEP is simply 168 *
169 * wrong if task != current, SIGKILL can wakeup the stopped 169 * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if
170 * tracee and set/clear can play with the running task, this 170 * task is current or it can't be running, otherwise we can race
171 * can confuse the next __switch_to_xtra(). 171 * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but
172 * PTRACE_KILL is not safe.
172 */ 173 */
173 local_irq_disable(); 174 local_irq_disable();
174 debugctl = get_debugctlmsr(); 175 debugctl = get_debugctlmsr();
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index b4d3c3927dd8..97ef74b88e0f 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -21,37 +21,23 @@
21 21
22/* 22/*
23 * Align a virtual address to avoid aliasing in the I$ on AMD F15h. 23 * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
24 *
25 * @flags denotes the allocation direction - bottomup or topdown -
26 * or vDSO; see call sites below.
27 */ 24 */
28unsigned long align_addr(unsigned long addr, struct file *filp, 25static unsigned long get_align_mask(void)
29 enum align_flags flags)
30{ 26{
31 unsigned long tmp_addr;
32
33 /* handle 32- and 64-bit case with a single conditional */ 27 /* handle 32- and 64-bit case with a single conditional */
34 if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) 28 if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
35 return addr; 29 return 0;
36 30
37 if (!(current->flags & PF_RANDOMIZE)) 31 if (!(current->flags & PF_RANDOMIZE))
38 return addr; 32 return 0;
39
40 if (!((flags & ALIGN_VDSO) || filp))
41 return addr;
42
43 tmp_addr = addr;
44
45 /*
46 * We need an address which is <= than the original
47 * one only when in topdown direction.
48 */
49 if (!(flags & ALIGN_TOPDOWN))
50 tmp_addr += va_align.mask;
51 33
52 tmp_addr &= ~va_align.mask; 34 return va_align.mask;
35}
53 36
54 return tmp_addr; 37unsigned long align_vdso_addr(unsigned long addr)
38{
39 unsigned long align_mask = get_align_mask();
40 return (addr + align_mask) & ~align_mask;
55} 41}
56 42
57static int __init control_va_addr_alignment(char *str) 43static int __init control_va_addr_alignment(char *str)
@@ -126,7 +112,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
126{ 112{
127 struct mm_struct *mm = current->mm; 113 struct mm_struct *mm = current->mm;
128 struct vm_area_struct *vma; 114 struct vm_area_struct *vma;
129 unsigned long start_addr; 115 struct vm_unmapped_area_info info;
130 unsigned long begin, end; 116 unsigned long begin, end;
131 117
132 if (flags & MAP_FIXED) 118 if (flags & MAP_FIXED)
@@ -144,50 +130,16 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
144 (!vma || addr + len <= vma->vm_start)) 130 (!vma || addr + len <= vma->vm_start))
145 return addr; 131 return addr;
146 } 132 }
147 if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32))
148 && len <= mm->cached_hole_size) {
149 mm->cached_hole_size = 0;
150 mm->free_area_cache = begin;
151 }
152 addr = mm->free_area_cache;
153 if (addr < begin)
154 addr = begin;
155 start_addr = addr;
156
157full_search:
158
159 addr = align_addr(addr, filp, 0);
160
161 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
162 /* At this point: (!vma || addr < vma->vm_end). */
163 if (end - len < addr) {
164 /*
165 * Start a new search - just in case we missed
166 * some holes.
167 */
168 if (start_addr != begin) {
169 start_addr = addr = begin;
170 mm->cached_hole_size = 0;
171 goto full_search;
172 }
173 return -ENOMEM;
174 }
175 if (!vma || addr + len <= vma->vm_start) {
176 /*
177 * Remember the place where we stopped the search:
178 */
179 mm->free_area_cache = addr + len;
180 return addr;
181 }
182 if (addr + mm->cached_hole_size < vma->vm_start)
183 mm->cached_hole_size = vma->vm_start - addr;
184 133
185 addr = vma->vm_end; 134 info.flags = 0;
186 addr = align_addr(addr, filp, 0); 135 info.length = len;
187 } 136 info.low_limit = begin;
137 info.high_limit = end;
138 info.align_mask = filp ? get_align_mask() : 0;
139 info.align_offset = pgoff << PAGE_SHIFT;
140 return vm_unmapped_area(&info);
188} 141}
189 142
190
191unsigned long 143unsigned long
192arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 144arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
193 const unsigned long len, const unsigned long pgoff, 145 const unsigned long len, const unsigned long pgoff,
@@ -195,7 +147,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
195{ 147{
196 struct vm_area_struct *vma; 148 struct vm_area_struct *vma;
197 struct mm_struct *mm = current->mm; 149 struct mm_struct *mm = current->mm;
198 unsigned long addr = addr0, start_addr; 150 unsigned long addr = addr0;
151 struct vm_unmapped_area_info info;
199 152
200 /* requested length too big for entire address space */ 153 /* requested length too big for entire address space */
201 if (len > TASK_SIZE) 154 if (len > TASK_SIZE)
@@ -217,51 +170,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
217 return addr; 170 return addr;
218 } 171 }
219 172
220 /* check if free_area_cache is useful for us */ 173 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
221 if (len <= mm->cached_hole_size) { 174 info.length = len;
222 mm->cached_hole_size = 0; 175 info.low_limit = PAGE_SIZE;
223 mm->free_area_cache = mm->mmap_base; 176 info.high_limit = mm->mmap_base;
224 } 177 info.align_mask = filp ? get_align_mask() : 0;
225 178 info.align_offset = pgoff << PAGE_SHIFT;
226try_again: 179 addr = vm_unmapped_area(&info);
227 /* either no address requested or can't fit in requested address hole */ 180 if (!(addr & ~PAGE_MASK))
228 start_addr = addr = mm->free_area_cache; 181 return addr;
229 182 VM_BUG_ON(addr != -ENOMEM);
230 if (addr < len)
231 goto fail;
232
233 addr -= len;
234 do {
235 addr = align_addr(addr, filp, ALIGN_TOPDOWN);
236
237 /*
238 * Lookup failure means no vma is above this address,
239 * else if new region fits below vma->vm_start,
240 * return with success:
241 */
242 vma = find_vma(mm, addr);
243 if (!vma || addr+len <= vma->vm_start)
244 /* remember the address as a hint for next time */
245 return mm->free_area_cache = addr;
246
247 /* remember the largest hole we saw so far */
248 if (addr + mm->cached_hole_size < vma->vm_start)
249 mm->cached_hole_size = vma->vm_start - addr;
250
251 /* try just below the current vma->vm_start */
252 addr = vma->vm_start-len;
253 } while (len < vma->vm_start);
254
255fail:
256 /*
257 * if hint left us with no space for the requested
258 * mapping then try again:
259 */
260 if (start_addr != mm->mmap_base) {
261 mm->free_area_cache = mm->mmap_base;
262 mm->cached_hole_size = 0;
263 goto try_again;
264 }
265 183
266bottomup: 184bottomup:
267 /* 185 /*
@@ -270,14 +188,5 @@ bottomup:
270 * can happen with large stack limits and large mmap() 188 * can happen with large stack limits and large mmap()
271 * allocations. 189 * allocations.
272 */ 190 */
273 mm->cached_hole_size = ~0UL; 191 return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
274 mm->free_area_cache = TASK_UNMAPPED_BASE;
275 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
276 /*
277 * Restore the topdown base:
278 */
279 mm->free_area_cache = mm->mmap_base;
280 mm->cached_hole_size = ~0UL;
281
282 return addr;
283} 192}
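
Both allocators above now delegate the hole search to vm_unmapped_area(), passing every parameter in one descriptor instead of consulting mm->free_area_cache. A userspace model of how that request is assembled; the field names mirror the hunk, but the type itself is illustrative:

#include <stddef.h>

struct unmapped_area_request {
	unsigned long flags;		/* 0 = bottom-up, or a topdown flag */
	unsigned long length;
	unsigned long low_limit;
	unsigned long high_limit;
	unsigned long align_mask;	/* get_align_mask() if file-backed, else 0 */
	unsigned long align_offset;	/* pgoff << PAGE_SHIFT */
};

static struct unmapped_area_request
make_bottomup_request(size_t len, unsigned long begin, unsigned long end,
		      unsigned long align_mask, unsigned long align_offset)
{
	struct unmapped_area_request info = {
		.flags		= 0,
		.length		= len,
		.low_limit	= begin,
		.high_limit	= end,
		.align_mask	= align_mask,
		.align_offset	= align_offset,
	};
	return info;
}
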
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 76ee97709a00..6e60b5fe2244 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -30,23 +30,110 @@
30#include <linux/mmzone.h> 30#include <linux/mmzone.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/smp.h> 32#include <linux/smp.h>
33#include <linux/irq.h>
33#include <asm/cpu.h> 34#include <asm/cpu.h>
34 35
35static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); 36static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
36 37
37#ifdef CONFIG_HOTPLUG_CPU 38#ifdef CONFIG_HOTPLUG_CPU
39
40#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0
41static int cpu0_hotpluggable = 1;
42#else
43static int cpu0_hotpluggable;
44static int __init enable_cpu0_hotplug(char *str)
45{
46 cpu0_hotpluggable = 1;
47 return 1;
48}
49
50__setup("cpu0_hotplug", enable_cpu0_hotplug);
51#endif
52
53#ifdef CONFIG_DEBUG_HOTPLUG_CPU0
54/*
55 * This function offlines a CPU as early as possible and allows userspace to
56 * boot up without the CPU. The CPU can be onlined back by user after boot.
57 *
58 * This is only called for debugging CPU offline/online feature.
59 */
60int __ref _debug_hotplug_cpu(int cpu, int action)
61{
62 struct device *dev = get_cpu_device(cpu);
63 int ret;
64
65 if (!cpu_is_hotpluggable(cpu))
66 return -EINVAL;
67
68 cpu_hotplug_driver_lock();
69
70 switch (action) {
71 case 0:
72 ret = cpu_down(cpu);
73 if (!ret) {
74 pr_info("CPU %u is now offline\n", cpu);
75 kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
76 } else
77 pr_debug("Can't offline CPU%d.\n", cpu);
78 break;
79 case 1:
80 ret = cpu_up(cpu);
81 if (!ret)
82 kobject_uevent(&dev->kobj, KOBJ_ONLINE);
83 else
84 pr_debug("Can't online CPU%d.\n", cpu);
85 break;
86 default:
87 ret = -EINVAL;
88 }
89
90 cpu_hotplug_driver_unlock();
91
92 return ret;
93}
94
95static int __init debug_hotplug_cpu(void)
96{
97 _debug_hotplug_cpu(0, 0);
98 return 0;
99}
100
101late_initcall_sync(debug_hotplug_cpu);
102#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */
103
38int __ref arch_register_cpu(int num) 104int __ref arch_register_cpu(int num)
39{ 105{
106 struct cpuinfo_x86 *c = &cpu_data(num);
107
108 /*
109 * Currently CPU0 is only hotpluggable on Intel platforms. Other
110 * vendors can add hotplug support later.
111 */
112 if (c->x86_vendor != X86_VENDOR_INTEL)
113 cpu0_hotpluggable = 0;
114
40 /* 115 /*
41 * CPU0 cannot be offlined due to several 116 * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate
42 * restrictions and assumptions in kernel. This basically 117 * depends on BSP. PIC interrupts depend on BSP.
43 * doesn't add a control file, one cannot attempt to offline
44 * BSP.
45 * 118 *
46 * Also certain PCI quirks require not to enable hotplug control 119 * If the BSP dependencies are under control, one can tell the kernel to
47 * for all CPU's. 120 * enable BSP hotplug. This basically adds a control file and
121 * one can attempt to offline BSP.
48 */ 122 */
49 if (num) 123 if (num == 0 && cpu0_hotpluggable) {
124 unsigned int irq;
125 /*
126 * We won't take down the boot processor on i386 if some
127 * interrupts can only be serviced by the BSP in PIC mode.
128 */
129 for_each_active_irq(irq) {
130 if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) {
131 cpu0_hotpluggable = 0;
132 break;
133 }
134 }
135 }
136 if (num || cpu0_hotpluggable)
50 per_cpu(cpu_devices, num).cpu.hotpluggable = 1; 137 per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
51 138
52 return register_cpu(&per_cpu(cpu_devices, num).cpu, num); 139 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
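
arch_register_cpu() above only marks CPU0 hotpluggable when no active interrupt is serviced exclusively through the PIC. A userspace model of that gate, with an illustrative IRQ descriptor in place of the kernel's:

#include <stdbool.h>
#include <stddef.h>

struct irq_model {
	bool io_apic_routed;	/* stands in for IO_APIC_IRQ(irq) */
	bool has_action;	/* stands in for irq_has_action(irq) */
};

static bool cpu0_hotpluggable_model(const struct irq_model *irqs, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (!irqs[i].io_apic_routed && irqs[i].has_action)
			return false;	/* a PIC-only IRQ pins CPU0 */
	return true;
}
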
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
new file mode 100644
index 000000000000..25b993729f9b
--- /dev/null
+++ b/arch/x86/kernel/trace_clock.c
@@ -0,0 +1,21 @@
1/*
2 * X86 trace clocks
3 */
4#include <asm/trace_clock.h>
5#include <asm/barrier.h>
6#include <asm/msr.h>
7
8/*
9 * trace_clock_x86_tsc(): A clock that is just the cycle counter.
10 *
11 * Unlike the other clocks, this is not in nanoseconds.
12 */
13u64 notrace trace_clock_x86_tsc(void)
14{
15 u64 ret;
16
17 rdtsc_barrier();
18 rdtscll(ret);
19
20 return ret;
21}
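
The new trace clock simply orders prior loads and reads the TSC. A userspace analogue using the x86 compiler intrinsics (_mm_lfence()/__rdtsc()) in place of the kernel's rdtsc_barrier()/rdtscll():

#include <stdint.h>
#include <x86intrin.h>

static uint64_t tsc_clock(void)
{
	_mm_lfence();		/* analogue of rdtsc_barrier() */
	return __rdtsc();	/* raw cycle count, not nanoseconds */
}
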
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8276dc6794cc..ecffca11f4e9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -55,7 +55,7 @@
55#include <asm/i387.h> 55#include <asm/i387.h>
56#include <asm/fpu-internal.h> 56#include <asm/fpu-internal.h>
57#include <asm/mce.h> 57#include <asm/mce.h>
58#include <asm/rcu.h> 58#include <asm/context_tracking.h>
59 59
60#include <asm/mach_traps.h> 60#include <asm/mach_traps.h>
61 61
@@ -69,9 +69,6 @@
69 69
70asmlinkage int system_call(void); 70asmlinkage int system_call(void);
71 71
72/* Do we ignore FPU interrupts ? */
73char ignore_fpu_irq;
74
75/* 72/*
76 * The IDT has to be page-aligned to simplify the Pentium 73 * The IDT has to be page-aligned to simplify the Pentium
77 * F0 0F bug workaround. 74 * F0 0F bug workaround.
@@ -564,9 +561,6 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
564 561
565dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 562dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
566{ 563{
567#ifdef CONFIG_X86_32
568 ignore_fpu_irq = 1;
569#endif
570 exception_enter(regs); 564 exception_enter(regs);
571 math_error(regs, error_code, X86_TRAP_MF); 565 math_error(regs, error_code, X86_TRAP_MF);
572 exception_exit(regs); 566 exception_exit(regs);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cfa5d4f7ca56..06ccb5073a3f 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -77,6 +77,12 @@ unsigned long long
77sched_clock(void) __attribute__((alias("native_sched_clock"))); 77sched_clock(void) __attribute__((alias("native_sched_clock")));
78#endif 78#endif
79 79
80unsigned long long native_read_tsc(void)
81{
82 return __native_read_tsc();
83}
84EXPORT_SYMBOL(native_read_tsc);
85
80int check_tsc_unstable(void) 86int check_tsc_unstable(void)
81{ 87{
82 return tsc_unstable; 88 return tsc_unstable;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index aafa5557b396..c71025b67462 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -478,6 +478,11 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
478 regs->ip = current->utask->xol_vaddr; 478 regs->ip = current->utask->xol_vaddr;
479 pre_xol_rip_insn(auprobe, regs, autask); 479 pre_xol_rip_insn(auprobe, regs, autask);
480 480
481 autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF);
482 regs->flags |= X86_EFLAGS_TF;
483 if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
484 set_task_blockstep(current, false);
485
481 return 0; 486 return 0;
482} 487}
483 488
@@ -603,6 +608,16 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
603 if (auprobe->fixups & UPROBE_FIX_CALL) 608 if (auprobe->fixups & UPROBE_FIX_CALL)
604 result = adjust_ret_addr(regs->sp, correction); 609 result = adjust_ret_addr(regs->sp, correction);
605 610
611 /*
612 * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
613 * so we can get an extra SIGTRAP if we do not clear TF. We need
614 * to examine the opcode to make it right.
615 */
616 if (utask->autask.saved_tf)
617 send_sig(SIGTRAP, current, 0);
618 else if (!(auprobe->fixups & UPROBE_FIX_SETF))
619 regs->flags &= ~X86_EFLAGS_TF;
620
606 return result; 621 return result;
607} 622}
608 623
@@ -647,6 +662,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
647 current->thread.trap_nr = utask->autask.saved_trap_nr; 662 current->thread.trap_nr = utask->autask.saved_trap_nr;
648 handle_riprel_post_xol(auprobe, regs, NULL); 663 handle_riprel_post_xol(auprobe, regs, NULL);
649 instruction_pointer_set(regs, utask->vaddr); 664 instruction_pointer_set(regs, utask->vaddr);
665
666 /* clear TF if it was set by us in arch_uprobe_pre_xol() */
667 if (!utask->autask.saved_tf)
668 regs->flags &= ~X86_EFLAGS_TF;
650} 669}
651 670
652/* 671/*
@@ -676,38 +695,3 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
676 send_sig(SIGTRAP, current, 0); 695 send_sig(SIGTRAP, current, 0);
677 return ret; 696 return ret;
678} 697}
679
680void arch_uprobe_enable_step(struct arch_uprobe *auprobe)
681{
682 struct task_struct *task = current;
683 struct arch_uprobe_task *autask = &task->utask->autask;
684 struct pt_regs *regs = task_pt_regs(task);
685
686 autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF);
687
688 regs->flags |= X86_EFLAGS_TF;
689 if (test_tsk_thread_flag(task, TIF_BLOCKSTEP))
690 set_task_blockstep(task, false);
691}
692
693void arch_uprobe_disable_step(struct arch_uprobe *auprobe)
694{
695 struct task_struct *task = current;
696 struct arch_uprobe_task *autask = &task->utask->autask;
697 bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED);
698 struct pt_regs *regs = task_pt_regs(task);
699 /*
700 * The state of TIF_BLOCKSTEP was not saved so we can get an extra
701 * SIGTRAP if we do not clear TF. We need to examine the opcode to
702 * make it right.
703 */
704 if (unlikely(trapped)) {
705 if (!autask->saved_tf)
706 regs->flags &= ~X86_EFLAGS_TF;
707 } else {
708 if (autask->saved_tf)
709 send_sig(SIGTRAP, task, 0);
710 else if (!(auprobe->fixups & UPROBE_FIX_SETF))
711 regs->flags &= ~X86_EFLAGS_TF;
712 }
713}
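
The uprobes hunks above fold the TF handling into arch_uprobe_pre_xol()/post_xol(): remember whether the task already had the trap flag, force it for the out-of-line step, and clear it afterwards only if uprobes set it and the instruction does not set it itself. A simplified userspace model of that bookkeeping, with illustrative types:

#include <stdbool.h>

#define X86_EFLAGS_TF_MODEL 0x100UL	/* trap flag bit, for illustration */

struct xol_model {
	unsigned long flags;
	bool saved_tf;
};

static void pre_xol_model(struct xol_model *t)
{
	t->saved_tf = !!(t->flags & X86_EFLAGS_TF_MODEL);
	t->flags |= X86_EFLAGS_TF_MODEL;	/* force single-step */
}

static void post_xol_model(struct xol_model *t, bool insn_sets_tf)
{
	if (!t->saved_tf && !insn_sets_tf)
		t->flags &= ~X86_EFLAGS_TF_MODEL;
	/* if saved_tf, the kernel instead sends SIGTRAP to the task */
}
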
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5c9687b1bde6..1dfe69cc78a8 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -182,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
182 if (pud_none_or_clear_bad(pud)) 182 if (pud_none_or_clear_bad(pud))
183 goto out; 183 goto out;
184 pmd = pmd_offset(pud, 0xA0000); 184 pmd = pmd_offset(pud, 0xA0000);
185 split_huge_page_pmd(mm, pmd); 185 split_huge_page_pmd_mm(mm, 0xA0000, pmd);
186 if (pmd_none_or_clear_bad(pmd)) 186 if (pmd_none_or_clear_bad(pmd))
187 goto out; 187 goto out;
188 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 188 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3a3e8c9e280d..9a907a67be8f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -145,19 +145,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
145 return nr; 145 return nr;
146} 146}
147 147
148#ifdef CONFIG_SECCOMP
149static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
150{
151 if (!seccomp_mode(&tsk->seccomp))
152 return 0;
153 task_pt_regs(tsk)->orig_ax = syscall_nr;
154 task_pt_regs(tsk)->ax = syscall_nr;
155 return __secure_computing(syscall_nr);
156}
157#else
158#define vsyscall_seccomp(_tsk, _nr) 0
159#endif
160
161static bool write_ok_or_segv(unsigned long ptr, size_t size) 148static bool write_ok_or_segv(unsigned long ptr, size_t size)
162{ 149{
163 /* 150 /*
@@ -190,10 +177,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
190{ 177{
191 struct task_struct *tsk; 178 struct task_struct *tsk;
192 unsigned long caller; 179 unsigned long caller;
193 int vsyscall_nr; 180 int vsyscall_nr, syscall_nr, tmp;
194 int prev_sig_on_uaccess_error; 181 int prev_sig_on_uaccess_error;
195 long ret; 182 long ret;
196 int skip;
197 183
198 /* 184 /*
199 * No point in checking CS -- the only way to get here is a user mode 185 * No point in checking CS -- the only way to get here is a user mode
@@ -225,56 +211,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
225 } 211 }
226 212
227 tsk = current; 213 tsk = current;
228 /*
229 * With a real vsyscall, page faults cause SIGSEGV. We want to
230 * preserve that behavior to make writing exploits harder.
231 */
232 prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
233 current_thread_info()->sig_on_uaccess_error = 1;
234 214
235 /* 215 /*
216 * Check for access_ok violations and find the syscall nr.
217 *
236 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and 218 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
237 * 64-bit, so we don't need to special-case it here. For all the 219 * 64-bit, so we don't need to special-case it here. For all the
238 * vsyscalls, NULL means "don't write anything" not "write it at 220 * vsyscalls, NULL means "don't write anything" not "write it at
239 * address 0". 221 * address 0".
240 */ 222 */
241 ret = -EFAULT;
242 skip = 0;
243 switch (vsyscall_nr) { 223 switch (vsyscall_nr) {
244 case 0: 224 case 0:
245 skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
246 if (skip)
247 break;
248
249 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || 225 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
250 !write_ok_or_segv(regs->si, sizeof(struct timezone))) 226 !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
251 break; 227 ret = -EFAULT;
228 goto check_fault;
229 }
230
231 syscall_nr = __NR_gettimeofday;
232 break;
233
234 case 1:
235 if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
236 ret = -EFAULT;
237 goto check_fault;
238 }
239
240 syscall_nr = __NR_time;
241 break;
242
243 case 2:
244 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
245 !write_ok_or_segv(regs->si, sizeof(unsigned))) {
246 ret = -EFAULT;
247 goto check_fault;
248 }
249
250 syscall_nr = __NR_getcpu;
251 break;
252 }
253
254 /*
255 * Handle seccomp. regs->ip must be the original value.
256 * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
257 *
258 * We could optimize the seccomp disabled case, but performance
259 * here doesn't matter.
260 */
261 regs->orig_ax = syscall_nr;
262 regs->ax = -ENOSYS;
263 tmp = secure_computing(syscall_nr);
264 if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
265 warn_bad_vsyscall(KERN_DEBUG, regs,
266 "seccomp tried to change syscall nr or ip");
267 do_exit(SIGSYS);
268 }
269 if (tmp)
270 goto do_ret; /* skip requested */
252 271
272 /*
273 * With a real vsyscall, page faults cause SIGSEGV. We want to
274 * preserve that behavior to make writing exploits harder.
275 */
276 prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
277 current_thread_info()->sig_on_uaccess_error = 1;
278
279 ret = -EFAULT;
280 switch (vsyscall_nr) {
281 case 0:
253 ret = sys_gettimeofday( 282 ret = sys_gettimeofday(
254 (struct timeval __user *)regs->di, 283 (struct timeval __user *)regs->di,
255 (struct timezone __user *)regs->si); 284 (struct timezone __user *)regs->si);
256 break; 285 break;
257 286
258 case 1: 287 case 1:
259 skip = vsyscall_seccomp(tsk, __NR_time);
260 if (skip)
261 break;
262
263 if (!write_ok_or_segv(regs->di, sizeof(time_t)))
264 break;
265
266 ret = sys_time((time_t __user *)regs->di); 288 ret = sys_time((time_t __user *)regs->di);
267 break; 289 break;
268 290
269 case 2: 291 case 2:
270 skip = vsyscall_seccomp(tsk, __NR_getcpu);
271 if (skip)
272 break;
273
274 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
275 !write_ok_or_segv(regs->si, sizeof(unsigned)))
276 break;
277
278 ret = sys_getcpu((unsigned __user *)regs->di, 292 ret = sys_getcpu((unsigned __user *)regs->di,
279 (unsigned __user *)regs->si, 293 (unsigned __user *)regs->si,
280 NULL); 294 NULL);
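The rewritten emulate_vsyscall() now does its work in a fixed order: translate the vsyscall slot into a real syscall number (checking the caller's pointers with write_ok_or_segv() first), present that number to seccomp before anything runs on the caller's behalf, and only then enable sig_on_uaccess_error and dispatch to sys_gettimeofday()/sys_time()/sys_getcpu(). A small userspace sketch of the slot-to-syscall mapping; the __NR_* values below are the x86-64 numbers hard-coded purely so the model compiles, not something this file defines:

#include <stdio.h>

/* x86-64 syscall numbers, hard-coded here only for illustration. */
enum { NR_gettimeofday = 96, NR_time = 201, NR_getcpu = 309 };

/* Map a legacy vsyscall slot (0..2) to the syscall number that seccomp
 * gets to inspect; -1 means "not a vsyscall slot". */
static int vsyscall_slot_to_syscall(int slot)
{
	switch (slot) {
	case 0: return NR_gettimeofday;
	case 1: return NR_time;
	case 2: return NR_getcpu;
	default: return -1;
	}
}

int main(void)
{
	for (int slot = 0; slot < 3; slot++)
		printf("vsyscall slot %d -> syscall %d\n",
		       slot, vsyscall_slot_to_syscall(slot));
	return 0;
}

Running the seccomp check on the translated number, with regs->orig_ax and regs->ax primed first, is what lets a filter's errno, trace or trap handling see the vsyscall exactly as if it had been a normal syscall entry, which is the point of dropping the old vsyscall_seccomp() helper.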
@@ -283,12 +297,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
283 297
284 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; 298 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
285 299
286 if (skip) { 300check_fault:
287 if ((long)regs->ax <= 0L) /* seccomp errno emulation */
288 goto do_ret;
289 goto done; /* seccomp trace/trap */
290 }
291
292 if (ret == -EFAULT) { 301 if (ret == -EFAULT) {
293 /* Bad news -- userspace fed a bad pointer to a vsyscall. */ 302 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
294 warn_bad_vsyscall(KERN_INFO, regs, 303 warn_bad_vsyscall(KERN_INFO, regs,
@@ -311,7 +320,6 @@ do_ret:
311 /* Emulate a ret instruction. */ 320 /* Emulate a ret instruction. */
312 regs->ip = caller; 321 regs->ip = caller;
313 regs->sp += 8; 322 regs->sp += 8;
314done:
315 return true; 323 return true;
316 324
317sigsegv: 325sigsegv:
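The do_ret path itself is unchanged in spirit: once the emulated call has produced its return value, the kernel fakes the ret instruction the vsyscall page would have executed, jumping to the saved return address and popping it off the user stack. A minimal illustrative model in plain C, not the kernel's pt_regs:

#include <stdint.h>
#include <assert.h>

struct regs_model { uint64_t ip, sp; };

/* Fake the "ret" the vsyscall page would have run: resume at the return
 * address that was read from the user stack earlier, and pop it. */
static void emulate_ret(struct regs_model *regs, uint64_t caller)
{
	regs->ip = caller;
	regs->sp += 8;          /* a 64-bit return address is 8 bytes */
}

int main(void)
{
	struct regs_model regs = { .ip = 0, .sp = 0x7fffffffe000ull };
	emulate_ret(&regs, 0x400123ull);
	assert(regs.ip == 0x400123ull && regs.sp == 0x7fffffffe008ull);
	return 0;
}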