Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--	arch/x86/kernel/Makefile	|    1
-rw-r--r--	arch/x86/kernel/acpi/boot.c	|    2
-rw-r--r--	arch/x86/kernel/apic/apic_flat_64.c	|    2
-rw-r--r--	arch/x86/kernel/apic/apic_noop.c	|    1
-rw-r--r--	arch/x86/kernel/apic/apic_numachip.c	|   10
-rw-r--r--	arch/x86/kernel/apic/bigsmp_32.c	|    1
-rw-r--r--	arch/x86/kernel/apic/es7000_32.c	|    2
-rw-r--r--	arch/x86/kernel/apic/io_apic.c	|   40
-rw-r--r--	arch/x86/kernel/apic/numaq_32.c	|    1
-rw-r--r--	arch/x86/kernel/apic/probe_32.c	|    1
-rw-r--r--	arch/x86/kernel/apic/summit_32.c	|    1
-rw-r--r--	arch/x86/kernel/apic/x2apic_cluster.c	|    1
-rw-r--r--	arch/x86/kernel/apic/x2apic_phys.c	|    1
-rw-r--r--	arch/x86/kernel/apic/x2apic_uv_x.c	|    1
-rw-r--r--	arch/x86/kernel/apm_32.c	|   11
-rw-r--r--	arch/x86/kernel/cpu/Makefile	|    1
-rw-r--r--	arch/x86/kernel/cpu/amd.c	|    3
-rw-r--r--	arch/x86/kernel/cpu/common.c	|   19
-rw-r--r--	arch/x86/kernel/cpu/intel_cacheinfo.c	|   44
-rw-r--r--	arch/x86/kernel/cpu/match.c	|   91
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce-severity.c	|   26
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce.c	|  195
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce_amd.c	|   11
-rw-r--r--	arch/x86/kernel/cpu/perf_event.c	|  171
-rw-r--r--	arch/x86/kernel/cpu/perf_event.h	|   58
-rw-r--r--	arch/x86/kernel/cpu/perf_event_amd.c	|   40
-rw-r--r--	arch/x86/kernel/cpu/perf_event_intel.c	|  158
-rw-r--r--	arch/x86/kernel/cpu/perf_event_intel_ds.c	|   23
-rw-r--r--	arch/x86/kernel/cpu/perf_event_intel_lbr.c	|  528
-rw-r--r--	arch/x86/kernel/cpu/scattered.c	|    1
-rw-r--r--	arch/x86/kernel/crash_dump_32.c	|    6
-rw-r--r--	arch/x86/kernel/devicetree.c	|  101
-rw-r--r--	arch/x86/kernel/dumpstack.c	|    3
-rw-r--r--	arch/x86/kernel/dumpstack_32.c	|    2
-rw-r--r--	arch/x86/kernel/dumpstack_64.c	|    8
-rw-r--r--	arch/x86/kernel/entry_32.S	|   17
-rw-r--r--	arch/x86/kernel/entry_64.S	|   82
-rw-r--r--	arch/x86/kernel/i387.c	|   83
-rw-r--r--	arch/x86/kernel/irq_32.c	|   11
-rw-r--r--	arch/x86/kernel/irqinit.c	|    6
-rw-r--r--	arch/x86/kernel/kgdb.c	|    6
-rw-r--r--	arch/x86/kernel/kprobes-common.h	|  102
-rw-r--r--	arch/x86/kernel/kprobes-opt.c	|  512
-rw-r--r--	arch/x86/kernel/kprobes.c	|  664
-rw-r--r--	arch/x86/kernel/kvm.c	|    4
-rw-r--r--	arch/x86/kernel/kvmclock.c	|   15
-rw-r--r--	arch/x86/kernel/microcode_amd.c	|   25
-rw-r--r--	arch/x86/kernel/microcode_core.c	|   15
-rw-r--r--	arch/x86/kernel/nmi_selftest.c	|   37
-rw-r--r--	arch/x86/kernel/paravirt.c	|    5
-rw-r--r--	arch/x86/kernel/pci-dma.c	|    5
-rw-r--r--	arch/x86/kernel/probe_roms.c	|    1
-rw-r--r--	arch/x86/kernel/process.c	|   25
-rw-r--r--	arch/x86/kernel/process_32.c	|   31
-rw-r--r--	arch/x86/kernel/process_64.c	|   36
-rw-r--r--	arch/x86/kernel/ptrace.c	|    1
-rw-r--r--	arch/x86/kernel/reboot.c	|   36
-rw-r--r--	arch/x86/kernel/setup.c	|   10
-rw-r--r--	arch/x86/kernel/signal.c	|    1
-rw-r--r--	arch/x86/kernel/smpboot.c	|   23
-rw-r--r--	arch/x86/kernel/sys_x86_64.c	|   34
-rw-r--r--	arch/x86/kernel/time.c	|    3
-rw-r--r--	arch/x86/kernel/traps.c	|   44
-rw-r--r--	arch/x86/kernel/tsc.c	|    7
-rw-r--r--	arch/x86/kernel/tsc_sync.c	|   29
-rw-r--r--	arch/x86/kernel/vm86_32.c	|    2
-rw-r--r--	arch/x86/kernel/x86_init.c	|    5
-rw-r--r--	arch/x86/kernel/xsave.c	|   13
68 files changed, 2349 insertions(+), 1106 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5369059c07a9..532d2e090e6f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
 obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_OPTPROBES) += kprobes-opt.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
 obj-$(CONFIG_KGDB) += kgdb.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ce664f33ea8e..406ed77216d0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void)
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 #include <acpi/processor.h>
 
-static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 {
 #ifdef CONFIG_ACPI_NUMA
 	int nid;
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 8c3cdded6f2b..359b6899a36c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -180,6 +180,7 @@ static struct apic apic_flat = {
 	.name = "flat",
 	.probe = flat_probe,
 	.acpi_madt_oem_check = flat_acpi_madt_oem_check,
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = flat_apic_id_registered,
 
 	.irq_delivery_mode = dest_LowestPrio,
@@ -337,6 +338,7 @@ static struct apic apic_physflat = {
 	.name = "physical flat",
 	.probe = physflat_probe,
 	.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = flat_apic_id_registered,
 
 	.irq_delivery_mode = dest_Fixed,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 775b82bc655c..634ae6cdd5c9 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -124,6 +124,7 @@ struct apic apic_noop = {
 	.probe = noop_probe,
 	.acpi_madt_oem_check = NULL,
 
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = noop_apic_id_registered,
 
 	.irq_delivery_mode = dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 09d3d8c1cd99..d9ea5f331ac5 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,6 +56,12 @@ static unsigned int read_xapic_id(void)
 	return get_apic_id(apic_read(APIC_ID));
 }
 
+static int numachip_apic_id_valid(int apicid)
+{
+	/* Trust what bootloader passes in MADT */
+	return 1;
+}
+
 static int numachip_apic_id_registered(void)
 {
 	return physid_isset(read_xapic_id(), phys_cpu_present_map);
@@ -223,10 +229,11 @@ static int __init numachip_system_init(void)
 }
 early_initcall(numachip_system_init);
 
-static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	if (!strncmp(oem_id, "NUMASC", 6)) {
 		numachip_system = 1;
+		setup_force_cpu_cap(X86_FEATURE_X2APIC);
 		return 1;
 	}
 
@@ -238,6 +245,7 @@ static struct apic apic_numachip __refconst = {
 	.name = "NumaConnect system",
 	.probe = numachip_probe,
 	.acpi_madt_oem_check = numachip_acpi_madt_oem_check,
+	.apic_id_valid = numachip_apic_id_valid,
 	.apic_id_registered = numachip_apic_id_registered,
 
 	.irq_delivery_mode = dest_Fixed,
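
For reference: the default_apic_id_valid() callback that the other apic drivers in this series plug in is not part of this patch. At the time of this series it is, roughly, a one-line helper in <asm/apic.h> (shown here only for context):

static inline int default_apic_id_valid(int apicid)
{
	/* xAPIC IDs are 8 bits wide; 0xff is the broadcast destination */
	return (apicid < 255);
}

NumaChip overrides it above to accept whatever the MADT provides, since its extended APIC IDs do not fit in 8 bits.
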
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 521bead01137..0cdec7065aff 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -198,6 +198,7 @@ static struct apic apic_bigsmp = {
 	.name = "bigsmp",
 	.probe = probe_bigsmp,
 	.acpi_madt_oem_check = NULL,
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = bigsmp_apic_id_registered,
 
 	.irq_delivery_mode = dest_Fixed,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 5d513bc47b6b..e42d1d3b9134 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -625,6 +625,7 @@ static struct apic __refdata apic_es7000_cluster = {
 	.name = "es7000",
 	.probe = probe_es7000,
 	.acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = es7000_apic_id_registered,
 
 	.irq_delivery_mode = dest_LowestPrio,
@@ -690,6 +691,7 @@ static struct apic __refdata apic_es7000 = {
 	.name = "es7000",
 	.probe = probe_es7000,
 	.acpi_madt_oem_check = es7000_acpi_madt_oem_check,
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = es7000_apic_id_registered,
 
 	.irq_delivery_mode = dest_Fixed,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index fb072754bc1d..6d10a66fc5a9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3967,18 +3967,36 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
 static __init int bad_ioapic(unsigned long address)
 {
 	if (nr_ioapics >= MAX_IO_APICS) {
-		printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
-			"(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
+		pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
+			MAX_IO_APICS, nr_ioapics);
 		return 1;
 	}
 	if (!address) {
-		printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
-			" found in table, skipping!\n");
+		pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n");
 		return 1;
 	}
 	return 0;
 }
 
+static __init int bad_ioapic_register(int idx)
+{
+	union IO_APIC_reg_00 reg_00;
+	union IO_APIC_reg_01 reg_01;
+	union IO_APIC_reg_02 reg_02;
+
+	reg_00.raw = io_apic_read(idx, 0);
+	reg_01.raw = io_apic_read(idx, 1);
+	reg_02.raw = io_apic_read(idx, 2);
+
+	if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) {
+		pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n",
+			mpc_ioapic_addr(idx));
+		return 1;
+	}
+
+	return 0;
+}
+
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 {
 	int idx = 0;
@@ -3995,6 +4013,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	ioapics[idx].mp_config.apicaddr = address;
 
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+
+	if (bad_ioapic_register(idx)) {
+		clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+		return;
+	}
+
 	ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
 	ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
 
@@ -4015,10 +4039,10 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	if (gsi_cfg->gsi_end >= gsi_top)
 		gsi_top = gsi_cfg->gsi_end + 1;
 
-	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
-		"GSI %d-%d\n", idx, mpc_ioapic_id(idx),
+	pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
+		idx, mpc_ioapic_id(idx),
 		mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
 		gsi_cfg->gsi_base, gsi_cfg->gsi_end);
 
 	nr_ioapics++;
 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index c4a61ca1349a..00d2422ca7c9 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -478,6 +478,7 @@ static struct apic __refdata apic_numaq = {
 	.name = "NUMAQ",
 	.probe = probe_numaq,
 	.acpi_madt_oem_check = NULL,
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = numaq_apic_id_registered,
 
 	.irq_delivery_mode = dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0787bb3412f4..ff2c1b9aac4d 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -92,6 +92,7 @@ static struct apic apic_default = {
 	.name = "default",
 	.probe = probe_default,
 	.acpi_madt_oem_check = NULL,
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = default_apic_id_registered,
 
 	.irq_delivery_mode = dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 19114423c58c..fea000b27f07 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -496,6 +496,7 @@ static struct apic apic_summit = {
 	.name = "summit",
 	.probe = probe_summit,
 	.acpi_madt_oem_check = summit_acpi_madt_oem_check,
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = summit_apic_id_registered,
 
 	.irq_delivery_mode = dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 500795875827..9193713060a9 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -213,6 +213,7 @@ static struct apic apic_x2apic_cluster = {
 	.name = "cluster x2apic",
 	.probe = x2apic_cluster_probe,
 	.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = x2apic_apic_id_registered,
 
 	.irq_delivery_mode = dest_LowestPrio,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index f5373dfde21e..bcd1db6eaca9 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -119,6 +119,7 @@ static struct apic apic_x2apic_phys = {
 	.name = "physical x2apic",
 	.probe = x2apic_phys_probe,
 	.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = x2apic_apic_id_registered,
 
 	.irq_delivery_mode = dest_Fixed,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 79b05b88aa19..fc4771425852 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -351,6 +351,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.name = "UV large system",
 	.probe = uv_probe,
 	.acpi_madt_oem_check = uv_acpi_madt_oem_check,
+	.apic_id_valid = default_apic_id_valid,
 	.apic_id_registered = uv_apic_id_registered,
 
 	.irq_delivery_mode = dest_Fixed,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f76623cbe263..5d56931a15b3 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1234,8 +1234,7 @@ static int suspend(int vetoable)
 	struct apm_user	*as;
 
 	dpm_suspend_start(PMSG_SUSPEND);
-
-	dpm_suspend_noirq(PMSG_SUSPEND);
+	dpm_suspend_end(PMSG_SUSPEND);
 
 	local_irq_disable();
 	syscore_suspend();
@@ -1259,9 +1258,9 @@ static int suspend(int vetoable)
 	syscore_resume();
 	local_irq_enable();
 
-	dpm_resume_noirq(PMSG_RESUME);
-
+	dpm_resume_start(PMSG_RESUME);
 	dpm_resume_end(PMSG_RESUME);
+
 	queue_event(APM_NORMAL_RESUME, NULL);
 	spin_lock(&user_list_lock);
 	for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1276,7 @@ static void standby(void)
 {
 	int err;
 
-	dpm_suspend_noirq(PMSG_SUSPEND);
+	dpm_suspend_end(PMSG_SUSPEND);
 
 	local_irq_disable();
 	syscore_suspend();
@@ -1291,7 +1290,7 @@ static void standby(void)
 	syscore_resume();
 	local_irq_enable();
 
-	dpm_resume_noirq(PMSG_RESUME);
+	dpm_resume_start(PMSG_RESUME);
 }
 
 static apm_event_t get_event(void)
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 25f24dccdcfa..6ab6aa2fdfdd 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -16,6 +16,7 @@ obj-y := intel_cacheinfo.o scattered.o topology.o
 obj-y += proc.o capflags.o powerflags.o common.o
 obj-y += vmware.o hypervisor.o sched.o mshyperv.o
 obj-y += rdrand.o
+obj-y += match.o
 
 obj-$(CONFIG_X86_32) += bugs.o
 obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f4773f4aae35..0a44b90602b0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>
 
 #include <linux/io.h>
+#include <linux/sched.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
@@ -456,6 +457,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 	if (c->x86_power & (1 << 8)) {
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+		if (!check_tsc_unstable())
+			sched_clock_stable = 1;
 	}
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d43cad74f166..e49477444fff 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -18,6 +18,7 @@
 #include <asm/archrandom.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
+#include <asm/debugreg.h>
 #include <asm/sections.h>
 #include <linux/topology.h>
 #include <linux/cpumask.h>
@@ -28,6 +29,7 @@
 #include <asm/apic.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/mtrr.h>
 #include <linux/numa.h>
 #include <asm/asm.h>
@@ -933,7 +935,7 @@ static const struct msr_range msr_range_array[] __cpuinitconst = {
 	{ 0xc0011000, 0xc001103b},
 };
 
-static void __cpuinit print_cpu_msr(void)
+static void __cpuinit __print_cpu_msr(void)
 {
 	unsigned index_min, index_max;
 	unsigned index;
@@ -997,13 +999,13 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 	else
 		printk(KERN_CONT "\n");
 
-#ifdef CONFIG_SMP
+	__print_cpu_msr();
+}
+
+void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c)
+{
 	if (c->cpu_index < show_msr)
-		print_cpu_msr();
-#else
-	if (show_msr)
-		print_cpu_msr();
-#endif
+		__print_cpu_msr();
 }
 
 static __init int setup_disablecpuid(char *arg)
@@ -1044,6 +1046,8 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+
 /*
  * Special IST stacks which the CPU switches to when it calls
  * an IST-marked descriptor entry. Up to 7 stacks (hardware
@@ -1111,6 +1115,7 @@ void debug_stack_reset(void)
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b45e5e7a901..73d08ed98a64 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)
 	l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }
 
-static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
-					int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
 {
 	int node;
 
@@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
 #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
 
 #ifdef CONFIG_SMP
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+
+static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
 {
-	struct _cpuid4_info *this_leaf, *sibling_leaf;
-	unsigned long num_threads_sharing;
-	int index_msb, i, sibling;
+	struct _cpuid4_info *this_leaf;
+	int ret, i, sibling;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
+	ret = 0;
+	if (index == 3) {
+		ret = 1;
 		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
 			if (!per_cpu(ici_cpuid4_info, i))
 				continue;
@@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
 				set_bit(sibling, this_leaf->shared_cpu_map);
 			}
 		}
-		return;
+	} else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) {
+		ret = 1;
+		for_each_cpu(i, cpu_sibling_mask(cpu)) {
+			if (!per_cpu(ici_cpuid4_info, i))
+				continue;
+			this_leaf = CPUID4_INFO_IDX(i, index);
+			for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+				if (!cpu_online(sibling))
+					continue;
+				set_bit(sibling, this_leaf->shared_cpu_map);
+			}
+		}
 	}
+
+	return ret;
+}
+
+static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+{
+	struct _cpuid4_info *this_leaf, *sibling_leaf;
+	unsigned long num_threads_sharing;
+	int index_msb, i;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+	if (c->x86_vendor == X86_VENDOR_AMD) {
+		if (cache_shared_amd_cpu_map_setup(cpu, index))
+			return;
+	}
+
 	this_leaf = CPUID4_INFO_IDX(cpu, index);
 	num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
 
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
new file mode 100644
index 000000000000..5502b289341b
--- /dev/null
+++ b/arch/x86/kernel/cpu/match.c
@@ -0,0 +1,91 @@
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+/**
+ * x86_match_cpu - match current CPU again an array of x86_cpu_ids
+ * @match: Pointer to array of x86_cpu_ids. Last entry terminated with
+ *         {}.
+ *
+ * Return the entry if the current CPU matches the entries in the
+ * passed x86_cpu_id match table. Otherwise NULL. The match table
+ * contains vendor (X86_VENDOR_*), family, model and feature bits or
+ * respective wildcard entries.
+ *
+ * A typical table entry would be to match a specific CPU
+ * { X86_VENDOR_INTEL, 6, 0x12 }
+ * or to match a specific CPU feature
+ * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
+ *
+ * Arrays used to match for this should also be declared using
+ * MODULE_DEVICE_TABLE(x86_cpu, ...)
+ *
+ * This always matches against the boot cpu, assuming models and features are
+ * consistent over all CPUs.
+ */
+const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
+{
+	const struct x86_cpu_id *m;
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
+		if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
+			continue;
+		if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
+			continue;
+		if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
+			continue;
+		if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
+			continue;
+		return m;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(x86_match_cpu);
+
+ssize_t arch_print_cpu_modalias(struct device *dev,
+				struct device_attribute *attr,
+				char *bufptr)
+{
+	int size = PAGE_SIZE;
+	int i, n;
+	char *buf = bufptr;
+
+	n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:"
+		     "model:%04X:feature:",
+		     boot_cpu_data.x86_vendor,
+		     boot_cpu_data.x86,
+		     boot_cpu_data.x86_model);
+	size -= n;
+	buf += n;
+	size -= 1;
+	for (i = 0; i < NCAPINTS*32; i++) {
+		if (boot_cpu_has(i)) {
+			n = snprintf(buf, size, ",%04X", i);
+			if (n >= size) {
+				WARN(1, "x86 features overflow page\n");
+				break;
+			}
+			size -= n;
+			buf += n;
+		}
+	}
+	*buf++ = '\n';
+	return buf - bufptr;
+}
+
+int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (buf) {
+		arch_print_cpu_modalias(NULL, NULL, buf);
+		add_uevent_var(env, "MODALIAS=%s", buf);
+		kfree(buf);
+	}
+	return 0;
+}
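
A typical consumer of this new helper would declare a match table, export it for module autoloading, and bail out early when the running CPU does not match. The sketch below follows the kernel-doc comment above; the "foo" driver and its table name are made up for illustration:

/* hypothetical usage sketch, not part of this patch */
#include <linux/module.h>
#include <asm/cpu_device_id.h>

static const struct x86_cpu_id foo_cpu_ids[] = {
	{ X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_ANY },
	{}	/* terminating empty entry */
};
MODULE_DEVICE_TABLE(x86cpu, foo_cpu_ids);

static int __init foo_init(void)
{
	if (!x86_match_cpu(foo_cpu_ids))
		return -ENODEV;
	/* ... normal driver initialisation ... */
	return 0;
}
module_init(foo_init);

With the device table in place, udev can autoload the module on matching CPUs via the "x86cpu:..." modalias that arch_print_cpu_modalias() emits.
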
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 7395d5f4272d..0c82091b1652 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -54,7 +54,14 @@ static struct severity {
 #define MASK(x, y) .mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
 #define MCACOD 0xffff
+/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
+#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */
+#define MCACOD_SCRUBMSK	0xfff0
+#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */
+#define MCACOD_DATA	0x0134	/* Data Load */
+#define MCACOD_INSTR	0x0150	/* Instruction Fetch */
 
 	MCESEV(
 		NO, "Invalid",
@@ -102,11 +109,24 @@ static struct severity {
 		SER, BITCLR(MCI_STATUS_S)
 		),
 
-	/* AR add known MCACODs here */
 	MCESEV(
 		PANIC, "Action required with lost events",
 		SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
 		),
+
+	/* known AR MCACODs: */
+#ifdef CONFIG_MEMORY_FAILURE
+	MCESEV(
+		KEEP, "HT thread notices Action required: data load error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+		MCGMASK(MCG_STATUS_EIPV, 0)
+		),
+	MCESEV(
+		AR, "Action required: data load error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+		USER
+		),
+#endif
 	MCESEV(
 		PANIC, "Action required: unknown MCACOD",
 		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
@@ -115,11 +135,11 @@ static struct severity {
 	/* known AO MCACODs: */
 	MCESEV(
 		AO, "Action optional: memory scrubbing error",
-		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
 		),
 	MCESEV(
 		AO, "Action optional: last level cache writeback error",
-		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
 		),
 	MCESEV(
 		SOME, "Action optional: unknown MCACOD",
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e91..d086a09c087d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -191,7 +191,7 @@ static void drain_mcelog_buffer(void)
 {
 	unsigned int next, i, prev = 0;
 
-	next = rcu_dereference_check_mce(mcelog.next);
+	next = ACCESS_ONCE(mcelog.next);
 
 	do {
 		struct mce *m;
@@ -540,6 +540,27 @@ static void mce_report_event(struct pt_regs *regs)
 	irq_work_queue(&__get_cpu_var(mce_irq_work));
 }
 
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+	if (m->status & MCI_STATUS_MISCV)
+		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+	if (m->status & MCI_STATUS_ADDRV) {
+		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+		/*
+		 * Mask the reported address by the reported granularity.
+		 */
+		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
+			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+			m->addr >>= shift;
+			m->addr <<= shift;
+		}
+	}
+}
+
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
@@ -590,10 +611,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 			continue;
 
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		if (!(flags & MCP_TIMESTAMP))
 			m.tsc = 0;
@@ -917,6 +935,49 @@ static void mce_clear_state(unsigned long *toclear)
 }
 
 /*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define MCE_INFO_MAX	16
+
+struct mce_info {
+	atomic_t		inuse;
+	struct task_struct	*t;
+	__u64			paddr;
+} mce_info[MCE_INFO_MAX];
+
+static void mce_save_info(__u64 addr)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+			mi->t = current;
+			mi->paddr = addr;
+			return;
+		}
+	}
+
+	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+
+static struct mce_info *mce_find_info(void)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+		if (atomic_read(&mi->inuse) && mi->t == current)
+			return mi;
+	return NULL;
+}
+
+static void mce_clear_info(struct mce_info *mi)
+{
+	atomic_set(&mi->inuse, 0);
+}
+
+/*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
  *
@@ -969,7 +1030,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	barrier();
 
 	/*
-	 * When no restart IP must always kill or panic.
+	 * When no restart IP might need to kill or panic.
+	 * Assume the worst for now, but if we find the
+	 * severity is MCE_AR_SEVERITY we have other options.
 	 */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
 		kill_it = 1;
@@ -1023,16 +1086,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 			continue;
 		}
 
-		/*
-		 * Kill on action required.
-		 */
-		if (severity == MCE_AR_SEVERITY)
-			kill_it = 1;
-
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		/*
 		 * Action optional error. Queue address for later processing.
@@ -1052,6 +1106,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		}
 	}
 
+	/* mce_clear_state will clear *final, save locally for use later */
+	m = *final;
+
 	if (!no_way_out)
 		mce_clear_state(toclear);
 
@@ -1063,27 +1120,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		no_way_out = worst >= MCE_PANIC_SEVERITY;
 
 	/*
-	 * If we have decided that we just CAN'T continue, and the user
-	 * has not set tolerant to an insane level, give up and die.
-	 *
-	 * This is mainly used in the case when the system doesn't
-	 * support MCE broadcasting or it has been disabled.
-	 */
-	if (no_way_out && tolerant < 3)
-		mce_panic("Fatal machine check on current CPU", final, msg);
-
-	/*
-	 * If the error seems to be unrecoverable, something should be
-	 * done. Try to kill as little as possible. If we can kill just
-	 * one task, do that. If the user has set the tolerance very
-	 * high, don't try to do anything at all.
+	 * At insane "tolerant" levels we take no action. Otherwise
+	 * we only die if we have no other choice. For less serious
+	 * issues we try to recover, or limit damage to the current
+	 * process.
 	 */
-
-	if (kill_it && tolerant < 3)
-		force_sig(SIGBUS, current);
-
-	/* notify userspace ASAP */
-	set_thread_flag(TIF_MCE_NOTIFY);
+	if (tolerant < 3) {
+		if (no_way_out)
+			mce_panic("Fatal machine check on current CPU", &m, msg);
+		if (worst == MCE_AR_SEVERITY) {
+			/* schedule action before return to userland */
+			mce_save_info(m.addr);
+			set_thread_flag(TIF_MCE_NOTIFY);
+		} else if (kill_it) {
+			force_sig(SIGBUS, current);
+		}
+	}
 
 	if (worst > 0)
 		mce_report_event(regs);
@@ -1094,34 +1146,57 @@ out:
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int vector, int flags)
 {
-	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+	/* mce_severity() should not hand us an ACTION_REQUIRED error */
+	BUG_ON(flags & MF_ACTION_REQUIRED);
+	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
+		"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+
+	return 0;
 }
+#endif
 
 /*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in process context that interrupted by MCE and marked with
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
  */
 void mce_notify_process(void)
 {
 	unsigned long pfn;
-	mce_notify_irq();
-	while (mce_ring_get(&pfn))
-		memory_failure(pfn, MCE_VECTOR);
+	struct mce_info *mi = mce_find_info();
+
+	if (!mi)
+		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+	pfn = mi->paddr >> PAGE_SHIFT;
+
+	clear_thread_flag(TIF_MCE_NOTIFY);
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx",
+		 mi->paddr);
+	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
+		pr_err("Memory error not recovered");
+		force_sig(SIGBUS, current);
+	}
+	mce_clear_info(mi);
 }
 
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
 static void mce_process_work(struct work_struct *dummy)
 {
-	mce_notify_process();
+	unsigned long pfn;
+
+	while (mce_ring_get(&pfn))
+		memory_failure(pfn, MCE_VECTOR, 0);
 }
 
 #ifdef CONFIG_X86_MCE_INTEL
@@ -1211,8 +1286,6 @@ int mce_notify_irq(void)
 	/* Not more than two messages every minute */
 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 
-	clear_thread_flag(TIF_MCE_NOTIFY);
-
 	if (test_and_clear_bit(0, &mce_need_notify)) {
 		/* wake processes polling /dev/mcelog */
 		wake_up_interruptible(&mce_chrdev_wait);
@@ -1541,6 +1614,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
 		/* Error or no more MCE record */
 		if (rc <= 0) {
 			mce_apei_read_done = 1;
+			/*
+			 * When ERST is disabled, mce_chrdev_read() should return
+			 * "no record" instead of "no device."
+			 */
+			if (rc == -ENODEV)
+				return 0;
 			return rc;
 		}
 		rc = -EFAULT;
@@ -1859,7 +1938,7 @@ static struct bus_type mce_subsys = {
 	.dev_name = "machinecheck",
 };
 
-struct device *mce_device[CONFIG_NR_CPUS];
+DEFINE_PER_CPU(struct device *, mce_device);
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -2038,7 +2117,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
 			goto error2;
 	}
 	cpumask_set_cpu(cpu, mce_device_initialized);
-	mce_device[cpu] = dev;
+	per_cpu(mce_device, cpu) = dev;
 
 	return 0;
 error2:
@@ -2055,7 +2134,7 @@ error:
 
 static __cpuinit void mce_device_remove(unsigned int cpu)
 {
-	struct device *dev = mce_device[cpu];
+	struct device *dev = per_cpu(mce_device, cpu);
 	int i;
 
 	if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2069,7 +2148,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
 
 	device_unregister(dev);
 	cpumask_clear_cpu(cpu, mce_device_initialized);
-	mce_device[cpu] = NULL;
+	per_cpu(mce_device, cpu) = NULL;
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
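
One detail worth calling out from the new mce_read_aux() above: when MCi_MISC is valid, MCI_MISC_ADDR_LSB() yields how many low address bits are meaningless, and the handler rounds MCi_ADDR down to that granularity before anyone consumes it. A stand-alone illustration of the same arithmetic (helper name made up for this note, not part of the patch):

/* illustration only, mirroring the masking done in mce_read_aux() */
static inline __u64 mce_addr_granularity_floor(__u64 addr, unsigned int lsb)
{
	/* e.g. lsb == 12 means the address is only accurate to a 4 KiB page */
	addr >>= lsb;
	addr <<= lsb;
	return addr;
}

This is why the recovery path can hand memory_failure() a page frame number: the reported address is already aligned to the granularity the hardware vouches for.
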
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 786e76a86322..99b57179f912 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -523,11 +523,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
 	int i, err = 0;
 	struct threshold_bank *b = NULL;
-	struct device *dev = mce_device[cpu];
+	struct device *dev = per_cpu(mce_device, cpu);
 	char name[32];
 
 	sprintf(name, "threshold_bank%i", bank);
 
+#ifdef CONFIG_SMP
 	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {	/* symlink */
 		i = cpumask_first(cpu_llc_shared_mask(cpu));
 
@@ -553,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 		goto out;
 	}
+#endif
 
 	b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
 	if (!b) {
@@ -585,7 +587,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		dev = mce_device[i];
+		dev = per_cpu(mce_device, i);
 		if (dev)
 			err = sysfs_create_link(&dev->kobj,b->kobj, name);
 		if (err)
@@ -665,7 +667,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&mce_device[cpu]->kobj, name);
+		dev = per_cpu(mce_device, cpu);
+		sysfs_remove_link(&dev->kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 
 		return;
@@ -677,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		dev = mce_device[i];
+		dev = per_cpu(mce_device, i);
 		if (dev)
 			sysfs_remove_link(&dev->kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5adce1040b11..fa2900c0e398 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/bitops.h>
+#include <linux/device.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
@@ -31,6 +32,7 @@
 #include <asm/compat.h>
 #include <asm/smp.h>
 #include <asm/alternative.h>
+#include <asm/timer.h>
 
 #include "perf_event.h"
 
@@ -351,6 +353,36 @@ int x86_setup_perfctr(struct perf_event *event)
 	return 0;
 }
 
+/*
+ * check that branch_sample_type is compatible with
+ * settings needed for precise_ip > 1 which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+	u64 m = event->attr.branch_sample_type;
+	u64 b = 0;
+
+	/* must capture all branches */
+	if (!(m & PERF_SAMPLE_BRANCH_ANY))
+		return 0;
+
+	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+	if (!event->attr.exclude_user)
+		b |= PERF_SAMPLE_BRANCH_USER;
+
+	if (!event->attr.exclude_kernel)
+		b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+	/*
+	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+	 */
+
+	return m == b;
+}
+
 int x86_pmu_hw_config(struct perf_event *event)
 {
 	if (event->attr.precise_ip) {
@@ -367,6 +399,36 @@ int x86_pmu_hw_config(struct perf_event *event)
 
 		if (event->attr.precise_ip > precise)
 			return -EOPNOTSUPP;
+		/*
+		 * check that PEBS LBR correction does not conflict with
+		 * whatever the user is asking with attr->branch_sample_type
+		 */
+		if (event->attr.precise_ip > 1) {
+			u64 *br_type = &event->attr.branch_sample_type;
+
+			if (has_branch_stack(event)) {
+				if (!precise_br_compat(event))
+					return -EOPNOTSUPP;
+
+				/* branch_sample_type is compatible */
+
+			} else {
+				/*
+				 * user did not specify branch_sample_type
+				 *
+				 * For PEBS fixups, we capture all
+				 * the branches at the priv level of the
+				 * event.
+				 */
+				*br_type = PERF_SAMPLE_BRANCH_ANY;
+
+				if (!event->attr.exclude_user)
+					*br_type |= PERF_SAMPLE_BRANCH_USER;
+
+				if (!event->attr.exclude_kernel)
+					*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+			}
+		}
 	}
 
 	/*
@@ -424,6 +486,10 @@ static int __x86_pmu_event_init(struct perf_event *event)
 	/* mark unused */
 	event->hw.extra_reg.idx = EXTRA_REG_NONE;
 
+	/* mark not used */
+	event->hw.extra_reg.idx = EXTRA_REG_NONE;
+	event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
 	return x86_pmu.hw_config(event);
 }
 
@@ -577,14 +643,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 	/* Prefer fixed purpose counters */
 	if (x86_pmu.num_counters_fixed) {
 		idx = X86_PMC_IDX_FIXED;
-		for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
 			if (!__test_and_set_bit(idx, sched->state.used))
 				goto done;
 		}
 	}
 	/* Grab the first unused counter starting with idx */
 	idx = sched->state.counter;
-	for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
+	for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
 		if (!__test_and_set_bit(idx, sched->state.used))
 			goto done;
 	}
@@ -1210,6 +1276,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 		break;
 
 	case CPU_STARTING:
+		if (x86_pmu.attr_rdpmc)
+			set_in_cr4(X86_CR4_PCE);
 		if (x86_pmu.cpu_starting)
 			x86_pmu.cpu_starting(cpu);
 		break;
@@ -1319,6 +1387,8 @@ static int __init init_hw_perf_events(void)
 		}
 	}
 
+	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
 	pr_info("... version: %d\n", x86_pmu.version);
 	pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
 	pr_info("... generic registers: %d\n", x86_pmu.num_counters);
@@ -1542,23 +1612,106 @@ static int x86_pmu_event_init(struct perf_event *event)
 	return err;
 }
 
+static int x86_pmu_event_idx(struct perf_event *event)
+{
+	int idx = event->hw.idx;
+
+	if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
+		idx -= X86_PMC_IDX_FIXED;
+		idx |= 1 << 30;
+	}
+
+	return idx + 1;
+}
+
+static ssize_t get_attr_rdpmc(struct device *cdev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
+static void change_rdpmc(void *info)
+{
+	bool enable = !!(unsigned long)info;
+
+	if (enable)
+		set_in_cr4(X86_CR4_PCE);
+	else
+		clear_in_cr4(X86_CR4_PCE);
+}
+
+static ssize_t set_attr_rdpmc(struct device *cdev,
+			      struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	unsigned long val = simple_strtoul(buf, NULL, 0);
+
+	if (!!val != !!x86_pmu.attr_rdpmc) {
+		x86_pmu.attr_rdpmc = !!val;
+		smp_call_function(change_rdpmc, (void *)val, 1);
+	}
+
+	return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+	&dev_attr_rdpmc.attr,
+	NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group = {
+	.attrs = x86_pmu_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+	&x86_pmu_attr_group,
+	NULL,
+};
+
+static void x86_pmu_flush_branch_stack(void)
+{
+	if (x86_pmu.flush_branch_stack)
+		x86_pmu.flush_branch_stack();
+}
+
 static struct pmu pmu = {
 	.pmu_enable = x86_pmu_enable,
 	.pmu_disable = x86_pmu_disable,
+
+	.attr_groups = x86_pmu_attr_groups,
 
 	.event_init = x86_pmu_event_init,
 
 	.add = x86_pmu_add,
 	.del = x86_pmu_del,
 	.start = x86_pmu_start,
 	.stop = x86_pmu_stop,
 	.read = x86_pmu_read,
 
 	.start_txn = x86_pmu_start_txn,
 	.cancel_txn = x86_pmu_cancel_txn,
 	.commit_txn = x86_pmu_commit_txn,
+
+	.event_idx = x86_pmu_event_idx,
+	.flush_branch_stack = x86_pmu_flush_branch_stack,
 };
 
+void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+{
+	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		return;
+
+	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+		return;
+
+	userpg->time_mult = this_cpu_read(cyc2ns);
+	userpg->time_shift = CYC2NS_SCALE_FACTOR;
+	userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
+}
+
 /*
  * callchain support
  */
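
The x86_pmu_event_idx() / attr_rdpmc / perf_update_user_clock pieces above exist so that a self-monitoring process can read its counters without a syscall. A hedged sketch of the user-space side follows; the field names come from the perf_event_mmap_page ABI, the kernel publishes "hardware index + 1" (see x86_pmu_event_idx() above), and the pc->lock seqcount retry loop plus error handling are omitted for brevity:

/* sketch: read one's own counter via RDPMC through the mmap'ed perf page */
#include <stdint.h>
#include <linux/perf_event.h>

static inline uint64_t rdpmc(uint32_t counter)
{
	uint32_t lo, hi;

	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return lo | ((uint64_t)hi << 32);
}

static uint64_t read_self_count(struct perf_event_mmap_page *pc)
{
	uint32_t idx = pc->index;	/* 0: not currently on a hardware counter */

	if (!idx)
		return 0;
	return pc->offset + rdpmc(idx - 1);
}

The /sys/devices/cpu/rdpmc attribute added above lets an administrator turn the CR4.PCE enablement (and hence this whole fast path) off system-wide.
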
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8944062f46e2..8484e77c211e 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -33,6 +33,7 @@ enum extra_reg_type {
 
 	EXTRA_REG_RSP_0 = 0,	/* offcore_response_0 */
 	EXTRA_REG_RSP_1 = 1,	/* offcore_response_1 */
+	EXTRA_REG_LBR   = 2,	/* lbr_select */
 
 	EXTRA_REG_MAX		/* number of entries needed */
 };
@@ -130,6 +131,8 @@ struct cpu_hw_events {
 	void				*lbr_context;
 	struct perf_branch_stack	lbr_stack;
 	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
+	struct er_account		*lbr_sel;
+	u64				br_sel;
 
 	/*
 	 * Intel host/guest exclude bits
@@ -147,7 +150,9 @@ struct cpu_hw_events {
 	/*
 	 * AMD specific bits
 	 */
 	struct amd_nb			*amd_nb;
+	/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+	u64				perf_ctr_virt_mask;
 
 	void				*kfree_on_online;
 };
@@ -266,6 +271,29 @@ struct x86_pmu_quirk {
266 void (*func)(void); 271 void (*func)(void);
267}; 272};
268 273
274union x86_pmu_config {
275 struct {
276 u64 event:8,
277 umask:8,
278 usr:1,
279 os:1,
280 edge:1,
281 pc:1,
282 interrupt:1,
283 __reserved1:1,
284 en:1,
285 inv:1,
286 cmask:8,
287 event2:4,
288 __reserved2:4,
289 go:1,
290 ho:1;
291 } bits;
292 u64 value;
293};
294
295#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
296
269/* 297/*
270 * struct x86_pmu - generic x86 pmu 298 * struct x86_pmu - generic x86 pmu
271 */ 299 */
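
X86_CONFIG() above packs the architectural PERFEVTSEL fields into a raw config value; later hunks in this patch use it to replace magic constants such as 0x108000c0. A stand-alone sketch that reproduces the union (with stdint types swapped in for the kernel's u64) and checks that encoding; build with gcc, since the named variadic macro is a GNU extension:

#include <stdio.h>
#include <stdint.h>

union x86_pmu_config {
	struct {
		uint64_t event:8, umask:8, usr:1, os:1, edge:1, pc:1, interrupt:1,
			 __reserved1:1, en:1, inv:1, cmask:8, event2:4,
			 __reserved2:4, go:1, ho:1;
	} bits;
	uint64_t value;
};

#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value

int main(void)
{
	/* INST_RETIRED.TOTAL_CYCLES: event=0xc0 (bits 0-7), inv (bit 23),
	 * cmask=16 (bits 24-31) => 0x10 << 24 | 1 << 23 | 0xc0 */
	uint64_t cfg = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);

	printf("%#llx\n", (unsigned long long)cfg);	/* prints 0x108000c0 */
	return 0;
}
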
@@ -307,10 +335,19 @@ struct x86_pmu {
307 struct x86_pmu_quirk *quirks; 335 struct x86_pmu_quirk *quirks;
308 int perfctr_second_write; 336 int perfctr_second_write;
309 337
338 /*
339 * sysfs attrs
340 */
341 int attr_rdpmc;
342
343 /*
344 * CPU Hotplug hooks
345 */
310 int (*cpu_prepare)(int cpu); 346 int (*cpu_prepare)(int cpu);
311 void (*cpu_starting)(int cpu); 347 void (*cpu_starting)(int cpu);
312 void (*cpu_dying)(int cpu); 348 void (*cpu_dying)(int cpu);
313 void (*cpu_dead)(int cpu); 349 void (*cpu_dead)(int cpu);
350 void (*flush_branch_stack)(void);
314 351
315 /* 352 /*
316 * Intel Arch Perfmon v2+ 353 * Intel Arch Perfmon v2+
@@ -332,6 +369,8 @@ struct x86_pmu {
332 */ 369 */
333 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ 370 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
334 int lbr_nr; /* hardware stack size */ 371 int lbr_nr; /* hardware stack size */
372 u64 lbr_sel_mask; /* LBR_SELECT valid bits */
373 const int *lbr_sel_map; /* lbr_select mappings */
335 374
336 /* 375 /*
337 * Extra registers for events 376 * Extra registers for events
@@ -417,9 +456,11 @@ void x86_pmu_disable_all(void);
417static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, 456static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
418 u64 enable_mask) 457 u64 enable_mask)
419{ 458{
459 u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
460
420 if (hwc->extra_reg.reg) 461 if (hwc->extra_reg.reg)
421 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); 462 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
422 wrmsrl(hwc->config_base, hwc->config | enable_mask); 463 wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
423} 464}
424 465
425void x86_pmu_enable_all(int added); 466void x86_pmu_enable_all(int added);
@@ -443,6 +484,15 @@ extern struct event_constraint emptyconstraint;
443 484
444extern struct event_constraint unconstrained; 485extern struct event_constraint unconstrained;
445 486
487static inline bool kernel_ip(unsigned long ip)
488{
489#ifdef CONFIG_X86_32
490 return ip > PAGE_OFFSET;
491#else
492 return (long)ip < 0;
493#endif
494}
495
446#ifdef CONFIG_CPU_SUP_AMD 496#ifdef CONFIG_CPU_SUP_AMD
447 497
448int amd_pmu_init(void); 498int amd_pmu_init(void);
@@ -523,6 +573,10 @@ void intel_pmu_lbr_init_nhm(void);
523 573
524void intel_pmu_lbr_init_atom(void); 574void intel_pmu_lbr_init_atom(void);
525 575
576void intel_pmu_lbr_init_snb(void);
577
578int intel_pmu_setup_lbr_filter(struct perf_event *event);
579
526int p4_pmu_init(void); 580int p4_pmu_init(void);
527 581
528int p6_pmu_init(void); 582int p6_pmu_init(void);
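
kernel_ip() moves into this header (its old copy is removed from perf_event_intel_ds.c below) so the new LBR code can reuse it. On x86-64 the kernel half of the canonical address space has bit 63 set, which is why the signed compare suffices. A tiny user-space illustration of the 64-bit case:

#include <stdio.h>
#include <stdint.h>

/* same test as the 64-bit branch of kernel_ip() above */
static int kernel_ip64(uint64_t ip)
{
	return (int64_t)ip < 0;
}

int main(void)
{
	printf("%d\n", kernel_ip64(0xffffffff81000000ULL));	/* kernel text: 1 */
	printf("%d\n", kernel_ip64(0x0000000000400000ULL));	/* user text:   0 */
	return 0;
}
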
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 0397b23be8e9..dd002faff7a6 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,4 +1,5 @@
1#include <linux/perf_event.h> 1#include <linux/perf_event.h>
2#include <linux/export.h>
2#include <linux/types.h> 3#include <linux/types.h>
3#include <linux/init.h> 4#include <linux/init.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
@@ -138,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event)
138 if (ret) 139 if (ret)
139 return ret; 140 return ret;
140 141
142 if (has_branch_stack(event))
143 return -EOPNOTSUPP;
144
141 if (event->attr.exclude_host && event->attr.exclude_guest) 145 if (event->attr.exclude_host && event->attr.exclude_guest)
142 /* 146 /*
143 * When HO == GO == 1 the hardware treats that as GO == HO == 0 147 * When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -357,7 +361,9 @@ static void amd_pmu_cpu_starting(int cpu)
357 struct amd_nb *nb; 361 struct amd_nb *nb;
358 int i, nb_id; 362 int i, nb_id;
359 363
360 if (boot_cpu_data.x86_max_cores < 2) 364 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
365
366 if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15)
361 return; 367 return;
362 368
363 nb_id = amd_get_nb_id(cpu); 369 nb_id = amd_get_nb_id(cpu);
@@ -587,9 +593,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
587 .put_event_constraints = amd_put_event_constraints, 593 .put_event_constraints = amd_put_event_constraints,
588 594
589 .cpu_prepare = amd_pmu_cpu_prepare, 595 .cpu_prepare = amd_pmu_cpu_prepare,
590 .cpu_starting = amd_pmu_cpu_starting,
591 .cpu_dead = amd_pmu_cpu_dead, 596 .cpu_dead = amd_pmu_cpu_dead,
592#endif 597#endif
598 .cpu_starting = amd_pmu_cpu_starting,
593}; 599};
594 600
595__init int amd_pmu_init(void) 601__init int amd_pmu_init(void)
@@ -621,3 +627,33 @@ __init int amd_pmu_init(void)
621 627
622 return 0; 628 return 0;
623} 629}
630
631void amd_pmu_enable_virt(void)
632{
633 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
634
635 cpuc->perf_ctr_virt_mask = 0;
636
637 /* Reload all events */
638 x86_pmu_disable_all();
639 x86_pmu_enable_all(0);
640}
641EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
642
643void amd_pmu_disable_virt(void)
644{
645 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
646
647 /*
648 * We only mask out the Host-only bit so that host-only counting works
649 * when SVM is disabled. If someone sets up a guest-only counter when
 650 * SVM is disabled, the Guest-only bit still gets set and the counter
651 * will not count anything.
652 */
653 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
654
655 /* Reload all events */
656 x86_pmu_disable_all();
657 x86_pmu_enable_all(0);
658}
659EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
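
perf_ctr_virt_mask works together with __x86_pmu_enable_event() from the perf_event.h hunk above: while SVM is not active the Host-Only bit is stripped from every event select before it is written, and amd_pmu_enable_virt() clears the mask so the host/guest bits become effective. A stand-alone sketch of that masking; the bit positions (HO at bit 41, matching the go/ho fields of the config union above, enable at bit 22) are stated here for illustration only:

#include <stdio.h>
#include <stdint.h>

#define AMD_PERFMON_EVENTSEL_HOSTONLY	(1ULL << 41)	/* HO bit of PERF_CTL */
#define ARCH_PERFMON_EVENTSEL_ENABLE	(1ULL << 22)

/* mirrors: wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask) */
static uint64_t eventsel_to_write(uint64_t config, uint64_t virt_mask)
{
	return (config | ARCH_PERFMON_EVENTSEL_ENABLE) & ~virt_mask;
}

int main(void)
{
	uint64_t cfg = 0xc0 | AMD_PERFMON_EVENTSEL_HOSTONLY;	/* host-only event */

	/* SVM off (amd_pmu_disable_virt): HO stripped, counting still works */
	printf("%#llx\n", (unsigned long long)
	       eventsel_to_write(cfg, AMD_PERFMON_EVENTSEL_HOSTONLY));

	/* SVM on (amd_pmu_enable_virt): mask cleared, HO reaches the MSR */
	printf("%#llx\n", (unsigned long long)
	       eventsel_to_write(cfg, 0));
	return 0;
}
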
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3bd37bdf1b8e..6a84e7f28f05 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -385,14 +385,15 @@ static __initconst const u64 westmere_hw_cache_event_ids
385#define NHM_LOCAL_DRAM (1 << 14) 385#define NHM_LOCAL_DRAM (1 << 14)
386#define NHM_NON_DRAM (1 << 15) 386#define NHM_NON_DRAM (1 << 15)
387 387
388#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM) 388#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
389#define NHM_REMOTE (NHM_REMOTE_DRAM)
389 390
390#define NHM_DMND_READ (NHM_DMND_DATA_RD) 391#define NHM_DMND_READ (NHM_DMND_DATA_RD)
391#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) 392#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
392#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) 393#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
393 394
394#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) 395#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
395#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD) 396#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
396#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) 397#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
397 398
398static __initconst const u64 nehalem_hw_cache_extra_regs 399static __initconst const u64 nehalem_hw_cache_extra_regs
@@ -416,16 +417,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
416 }, 417 },
417 [ C(NODE) ] = { 418 [ C(NODE) ] = {
418 [ C(OP_READ) ] = { 419 [ C(OP_READ) ] = {
419 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM, 420 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
420 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM, 421 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE,
421 }, 422 },
422 [ C(OP_WRITE) ] = { 423 [ C(OP_WRITE) ] = {
423 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM, 424 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
424 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM, 425 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE,
425 }, 426 },
426 [ C(OP_PREFETCH) ] = { 427 [ C(OP_PREFETCH) ] = {
427 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM, 428 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
428 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM, 429 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE,
429 }, 430 },
430 }, 431 },
431}; 432};
@@ -727,6 +728,19 @@ static __initconst const u64 atom_hw_cache_event_ids
727 }, 728 },
728}; 729};
729 730
731static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
732{
733 /* user explicitly requested branch sampling */
734 if (has_branch_stack(event))
735 return true;
736
737 /* implicit branch sampling to correct PEBS skid */
738 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
739 return true;
740
741 return false;
742}
743
730static void intel_pmu_disable_all(void) 744static void intel_pmu_disable_all(void)
731{ 745{
732 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 746 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -881,6 +895,13 @@ static void intel_pmu_disable_event(struct perf_event *event)
881 cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); 895 cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
882 cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); 896 cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
883 897
898 /*
 899 * must be disabled before any actual event
900 * because any event may be combined with LBR
901 */
902 if (intel_pmu_needs_lbr_smpl(event))
903 intel_pmu_lbr_disable(event);
904
884 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 905 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
885 intel_pmu_disable_fixed(hwc); 906 intel_pmu_disable_fixed(hwc);
886 return; 907 return;
@@ -935,6 +956,12 @@ static void intel_pmu_enable_event(struct perf_event *event)
935 intel_pmu_enable_bts(hwc->config); 956 intel_pmu_enable_bts(hwc->config);
936 return; 957 return;
937 } 958 }
959 /*
 960 * must be enabled before any actual event
961 * because any event may be combined with LBR
962 */
963 if (intel_pmu_needs_lbr_smpl(event))
964 intel_pmu_lbr_enable(event);
938 965
939 if (event->attr.exclude_host) 966 if (event->attr.exclude_host)
940 cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); 967 cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
@@ -1057,6 +1084,9 @@ again:
1057 1084
1058 data.period = event->hw.last_period; 1085 data.period = event->hw.last_period;
1059 1086
1087 if (has_branch_stack(event))
1088 data.br_stack = &cpuc->lbr_stack;
1089
1060 if (perf_event_overflow(event, &data, regs)) 1090 if (perf_event_overflow(event, &data, regs))
1061 x86_pmu_stop(event, 0); 1091 x86_pmu_stop(event, 0);
1062 } 1092 }
@@ -1123,17 +1153,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
1123 */ 1153 */
1124static struct event_constraint * 1154static struct event_constraint *
1125__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, 1155__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
1126 struct perf_event *event) 1156 struct perf_event *event,
1157 struct hw_perf_event_extra *reg)
1127{ 1158{
1128 struct event_constraint *c = &emptyconstraint; 1159 struct event_constraint *c = &emptyconstraint;
1129 struct hw_perf_event_extra *reg = &event->hw.extra_reg;
1130 struct er_account *era; 1160 struct er_account *era;
1131 unsigned long flags; 1161 unsigned long flags;
1132 int orig_idx = reg->idx; 1162 int orig_idx = reg->idx;
1133 1163
1134 /* already allocated shared msr */ 1164 /* already allocated shared msr */
1135 if (reg->alloc) 1165 if (reg->alloc)
1136 return &unconstrained; 1166 return NULL; /* call x86_get_event_constraint() */
1137 1167
1138again: 1168again:
1139 era = &cpuc->shared_regs->regs[reg->idx]; 1169 era = &cpuc->shared_regs->regs[reg->idx];
@@ -1156,14 +1186,10 @@ again:
1156 reg->alloc = 1; 1186 reg->alloc = 1;
1157 1187
1158 /* 1188 /*
1159 * All events using extra_reg are unconstrained. 1189 * need to call x86_get_event_constraint()
1160 * Avoids calling x86_get_event_constraints() 1190 * to check if associated event has constraints
1161 *
1162 * Must revisit if extra_reg controlling events
1163 * ever have constraints. Worst case we go through
1164 * the regular event constraint table.
1165 */ 1191 */
1166 c = &unconstrained; 1192 c = NULL;
1167 } else if (intel_try_alt_er(event, orig_idx)) { 1193 } else if (intel_try_alt_er(event, orig_idx)) {
1168 raw_spin_unlock_irqrestore(&era->lock, flags); 1194 raw_spin_unlock_irqrestore(&era->lock, flags);
1169 goto again; 1195 goto again;
@@ -1200,11 +1226,23 @@ static struct event_constraint *
1200intel_shared_regs_constraints(struct cpu_hw_events *cpuc, 1226intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
1201 struct perf_event *event) 1227 struct perf_event *event)
1202{ 1228{
1203 struct event_constraint *c = NULL; 1229 struct event_constraint *c = NULL, *d;
1204 1230 struct hw_perf_event_extra *xreg, *breg;
1205 if (event->hw.extra_reg.idx != EXTRA_REG_NONE) 1231
1206 c = __intel_shared_reg_get_constraints(cpuc, event); 1232 xreg = &event->hw.extra_reg;
1207 1233 if (xreg->idx != EXTRA_REG_NONE) {
1234 c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
1235 if (c == &emptyconstraint)
1236 return c;
1237 }
1238 breg = &event->hw.branch_reg;
1239 if (breg->idx != EXTRA_REG_NONE) {
1240 d = __intel_shared_reg_get_constraints(cpuc, event, breg);
1241 if (d == &emptyconstraint) {
1242 __intel_shared_reg_put_constraints(cpuc, xreg);
1243 c = d;
1244 }
1245 }
1208 return c; 1246 return c;
1209} 1247}
1210 1248
@@ -1252,6 +1290,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
1252 reg = &event->hw.extra_reg; 1290 reg = &event->hw.extra_reg;
1253 if (reg->idx != EXTRA_REG_NONE) 1291 if (reg->idx != EXTRA_REG_NONE)
1254 __intel_shared_reg_put_constraints(cpuc, reg); 1292 __intel_shared_reg_put_constraints(cpuc, reg);
1293
1294 reg = &event->hw.branch_reg;
1295 if (reg->idx != EXTRA_REG_NONE)
1296 __intel_shared_reg_put_constraints(cpuc, reg);
1255} 1297}
1256 1298
1257static void intel_put_event_constraints(struct cpu_hw_events *cpuc, 1299static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
@@ -1287,12 +1329,19 @@ static int intel_pmu_hw_config(struct perf_event *event)
1287 * 1329 *
1288 * Thereby we gain a PEBS capable cycle counter. 1330 * Thereby we gain a PEBS capable cycle counter.
1289 */ 1331 */
1290 u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */ 1332 u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
1333
1291 1334
1292 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); 1335 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
1293 event->hw.config = alt_config; 1336 event->hw.config = alt_config;
1294 } 1337 }
1295 1338
1339 if (intel_pmu_needs_lbr_smpl(event)) {
1340 ret = intel_pmu_setup_lbr_filter(event);
1341 if (ret)
1342 return ret;
1343 }
1344
1296 if (event->attr.type != PERF_TYPE_RAW) 1345 if (event->attr.type != PERF_TYPE_RAW)
1297 return 0; 1346 return 0;
1298 1347
@@ -1431,7 +1480,7 @@ static int intel_pmu_cpu_prepare(int cpu)
1431{ 1480{
1432 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1481 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1433 1482
1434 if (!x86_pmu.extra_regs) 1483 if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
1435 return NOTIFY_OK; 1484 return NOTIFY_OK;
1436 1485
1437 cpuc->shared_regs = allocate_shared_regs(cpu); 1486 cpuc->shared_regs = allocate_shared_regs(cpu);
@@ -1453,22 +1502,28 @@ static void intel_pmu_cpu_starting(int cpu)
1453 */ 1502 */
1454 intel_pmu_lbr_reset(); 1503 intel_pmu_lbr_reset();
1455 1504
1456 if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING)) 1505 cpuc->lbr_sel = NULL;
1506
1507 if (!cpuc->shared_regs)
1457 return; 1508 return;
1458 1509
1459 for_each_cpu(i, topology_thread_cpumask(cpu)) { 1510 if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
1460 struct intel_shared_regs *pc; 1511 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1512 struct intel_shared_regs *pc;
1461 1513
1462 pc = per_cpu(cpu_hw_events, i).shared_regs; 1514 pc = per_cpu(cpu_hw_events, i).shared_regs;
1463 if (pc && pc->core_id == core_id) { 1515 if (pc && pc->core_id == core_id) {
1464 cpuc->kfree_on_online = cpuc->shared_regs; 1516 cpuc->kfree_on_online = cpuc->shared_regs;
1465 cpuc->shared_regs = pc; 1517 cpuc->shared_regs = pc;
1466 break; 1518 break;
1519 }
1467 } 1520 }
1521 cpuc->shared_regs->core_id = core_id;
1522 cpuc->shared_regs->refcnt++;
1468 } 1523 }
1469 1524
1470 cpuc->shared_regs->core_id = core_id; 1525 if (x86_pmu.lbr_sel_map)
1471 cpuc->shared_regs->refcnt++; 1526 cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
1472} 1527}
1473 1528
1474static void intel_pmu_cpu_dying(int cpu) 1529static void intel_pmu_cpu_dying(int cpu)
@@ -1486,6 +1541,18 @@ static void intel_pmu_cpu_dying(int cpu)
1486 fini_debug_store_on_cpu(cpu); 1541 fini_debug_store_on_cpu(cpu);
1487} 1542}
1488 1543
1544static void intel_pmu_flush_branch_stack(void)
1545{
1546 /*
1547 * Intel LBR does not tag entries with the
 1548 * PID of the current task, so we need to
 1549 * flush it on a context switch.
 1550 * For now, we simply reset it.
1551 */
1552 if (x86_pmu.lbr_nr)
1553 intel_pmu_lbr_reset();
1554}
1555
1489static __initconst const struct x86_pmu intel_pmu = { 1556static __initconst const struct x86_pmu intel_pmu = {
1490 .name = "Intel", 1557 .name = "Intel",
1491 .handle_irq = intel_pmu_handle_irq, 1558 .handle_irq = intel_pmu_handle_irq,
@@ -1513,6 +1580,7 @@ static __initconst const struct x86_pmu intel_pmu = {
1513 .cpu_starting = intel_pmu_cpu_starting, 1580 .cpu_starting = intel_pmu_cpu_starting,
1514 .cpu_dying = intel_pmu_cpu_dying, 1581 .cpu_dying = intel_pmu_cpu_dying,
1515 .guest_get_msrs = intel_guest_get_msrs, 1582 .guest_get_msrs = intel_guest_get_msrs,
1583 .flush_branch_stack = intel_pmu_flush_branch_stack,
1516}; 1584};
1517 1585
1518static __init void intel_clovertown_quirk(void) 1586static __init void intel_clovertown_quirk(void)
@@ -1689,9 +1757,11 @@ __init int intel_pmu_init(void)
1689 x86_pmu.extra_regs = intel_nehalem_extra_regs; 1757 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1690 1758
1691 /* UOPS_ISSUED.STALLED_CYCLES */ 1759 /* UOPS_ISSUED.STALLED_CYCLES */
1692 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1760 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
1761 X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
1693 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ 1762 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1694 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; 1763 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
1764 X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
1695 1765
1696 x86_add_quirk(intel_nehalem_quirk); 1766 x86_add_quirk(intel_nehalem_quirk);
1697 1767
@@ -1726,9 +1796,11 @@ __init int intel_pmu_init(void)
1726 x86_pmu.er_flags |= ERF_HAS_RSP_1; 1796 x86_pmu.er_flags |= ERF_HAS_RSP_1;
1727 1797
1728 /* UOPS_ISSUED.STALLED_CYCLES */ 1798 /* UOPS_ISSUED.STALLED_CYCLES */
1729 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1799 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
1800 X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
1730 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ 1801 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1731 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; 1802 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
1803 X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
1732 1804
1733 pr_cont("Westmere events, "); 1805 pr_cont("Westmere events, ");
1734 break; 1806 break;
@@ -1739,7 +1811,7 @@ __init int intel_pmu_init(void)
1739 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 1811 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1740 sizeof(hw_cache_event_ids)); 1812 sizeof(hw_cache_event_ids));
1741 1813
1742 intel_pmu_lbr_init_nhm(); 1814 intel_pmu_lbr_init_snb();
1743 1815
1744 x86_pmu.event_constraints = intel_snb_event_constraints; 1816 x86_pmu.event_constraints = intel_snb_event_constraints;
1745 x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; 1817 x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
@@ -1749,9 +1821,11 @@ __init int intel_pmu_init(void)
1749 x86_pmu.er_flags |= ERF_NO_HT_SHARING; 1821 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
1750 1822
1751 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ 1823 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1752 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; 1824 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
1825 X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
1753 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ 1826 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
1754 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1; 1827 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
1828 X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
1755 1829
1756 pr_cont("SandyBridge events, "); 1830 pr_cont("SandyBridge events, ");
1757 break; 1831 break;
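
The data.br_stack hookup above is what delivers the LBR records to a consumer that asked for branch stacks. A hedged user-space sketch of such a request; PERF_SAMPLE_BRANCH_STACK and the branch_sample_type field belong to the companion perf core uapi changes, not to this diff:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_USER |
				  PERF_SAMPLE_BRANCH_ANY_CALL;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	/* samples read from the mmap ring now carry a perf_branch_stack */
	close(fd);
	return 0;
}
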
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 73da6b64f5b7..7f64df19e7dd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -3,6 +3,7 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4 4
5#include <asm/perf_event.h> 5#include <asm/perf_event.h>
6#include <asm/insn.h>
6 7
7#include "perf_event.h" 8#include "perf_event.h"
8 9
@@ -439,10 +440,6 @@ void intel_pmu_pebs_enable(struct perf_event *event)
439 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; 440 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
440 441
441 cpuc->pebs_enabled |= 1ULL << hwc->idx; 442 cpuc->pebs_enabled |= 1ULL << hwc->idx;
442 WARN_ON_ONCE(cpuc->enabled);
443
444 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
445 intel_pmu_lbr_enable(event);
446} 443}
447 444
448void intel_pmu_pebs_disable(struct perf_event *event) 445void intel_pmu_pebs_disable(struct perf_event *event)
@@ -455,9 +452,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
455 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); 452 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
456 453
457 hwc->config |= ARCH_PERFMON_EVENTSEL_INT; 454 hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
458
459 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
460 intel_pmu_lbr_disable(event);
461} 455}
462 456
463void intel_pmu_pebs_enable_all(void) 457void intel_pmu_pebs_enable_all(void)
@@ -476,17 +470,6 @@ void intel_pmu_pebs_disable_all(void)
476 wrmsrl(MSR_IA32_PEBS_ENABLE, 0); 470 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
477} 471}
478 472
479#include <asm/insn.h>
480
481static inline bool kernel_ip(unsigned long ip)
482{
483#ifdef CONFIG_X86_32
484 return ip > PAGE_OFFSET;
485#else
486 return (long)ip < 0;
487#endif
488}
489
490static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) 473static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
491{ 474{
492 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 475 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -573,6 +556,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
573 * both formats and we don't use the other fields in this 556 * both formats and we don't use the other fields in this
574 * routine. 557 * routine.
575 */ 558 */
559 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
576 struct pebs_record_core *pebs = __pebs; 560 struct pebs_record_core *pebs = __pebs;
577 struct perf_sample_data data; 561 struct perf_sample_data data;
578 struct pt_regs regs; 562 struct pt_regs regs;
@@ -603,6 +587,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
603 else 587 else
604 regs.flags &= ~PERF_EFLAGS_EXACT; 588 regs.flags &= ~PERF_EFLAGS_EXACT;
605 589
590 if (has_branch_stack(event))
591 data.br_stack = &cpuc->lbr_stack;
592
606 if (perf_event_overflow(event, &data, &regs)) 593 if (perf_event_overflow(event, &data, &regs))
607 x86_pmu_stop(event, 0); 594 x86_pmu_stop(event, 0);
608} 595}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 3fab3de3ce96..520b4265fcd2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -3,6 +3,7 @@
3 3
4#include <asm/perf_event.h> 4#include <asm/perf_event.h>
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include <asm/insn.h>
6 7
7#include "perf_event.h" 8#include "perf_event.h"
8 9
@@ -14,6 +15,100 @@ enum {
14}; 15};
15 16
16/* 17/*
18 * Intel LBR_SELECT bits
19 * Intel Vol3a, April 2011, Section 16.7 Table 16-10
20 *
21 * Hardware branch filter (not available on all CPUs)
22 */
23#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */
24#define LBR_USER_BIT 1 /* do not capture at ring > 0 */
25#define LBR_JCC_BIT 2 /* do not capture conditional branches */
26#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */
27#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */
28#define LBR_RETURN_BIT 5 /* do not capture near returns */
29#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
30#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
31#define LBR_FAR_BIT 8 /* do not capture far branches */
32
33#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
34#define LBR_USER (1 << LBR_USER_BIT)
35#define LBR_JCC (1 << LBR_JCC_BIT)
36#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT)
37#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT)
38#define LBR_RETURN (1 << LBR_RETURN_BIT)
39#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
40#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
41#define LBR_FAR (1 << LBR_FAR_BIT)
42
43#define LBR_PLM (LBR_KERNEL | LBR_USER)
44
45#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */
46#define LBR_NOT_SUPP -1 /* LBR filter not supported */
47#define LBR_IGN 0 /* ignored */
48
49#define LBR_ANY \
50 (LBR_JCC |\
51 LBR_REL_CALL |\
52 LBR_IND_CALL |\
53 LBR_RETURN |\
54 LBR_REL_JMP |\
55 LBR_IND_JMP |\
56 LBR_FAR)
57
58#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
59
60#define for_each_branch_sample_type(x) \
61 for ((x) = PERF_SAMPLE_BRANCH_USER; \
62 (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
63
64/*
 65 * x86 control flow change classification
 66 * x86 control flow changes include branches, interrupts, traps, faults
67 */
68enum {
69 X86_BR_NONE = 0, /* unknown */
70
71 X86_BR_USER = 1 << 0, /* branch target is user */
72 X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
73
74 X86_BR_CALL = 1 << 2, /* call */
75 X86_BR_RET = 1 << 3, /* return */
76 X86_BR_SYSCALL = 1 << 4, /* syscall */
77 X86_BR_SYSRET = 1 << 5, /* syscall return */
78 X86_BR_INT = 1 << 6, /* sw interrupt */
79 X86_BR_IRET = 1 << 7, /* return from interrupt */
80 X86_BR_JCC = 1 << 8, /* conditional */
81 X86_BR_JMP = 1 << 9, /* jump */
82 X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
83 X86_BR_IND_CALL = 1 << 11,/* indirect calls */
84};
85
86#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
87
88#define X86_BR_ANY \
89 (X86_BR_CALL |\
90 X86_BR_RET |\
91 X86_BR_SYSCALL |\
92 X86_BR_SYSRET |\
93 X86_BR_INT |\
94 X86_BR_IRET |\
95 X86_BR_JCC |\
96 X86_BR_JMP |\
97 X86_BR_IRQ |\
98 X86_BR_IND_CALL)
99
100#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
101
102#define X86_BR_ANY_CALL \
103 (X86_BR_CALL |\
104 X86_BR_IND_CALL |\
105 X86_BR_SYSCALL |\
106 X86_BR_IRQ |\
107 X86_BR_INT)
108
109static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
110
111/*
17 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI 112 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
18 * otherwise it becomes near impossible to get a reliable stack. 113 * otherwise it becomes near impossible to get a reliable stack.
19 */ 114 */
@@ -21,6 +116,10 @@ enum {
21static void __intel_pmu_lbr_enable(void) 116static void __intel_pmu_lbr_enable(void)
22{ 117{
23 u64 debugctl; 118 u64 debugctl;
119 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
120
121 if (cpuc->lbr_sel)
122 wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
24 123
25 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 124 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
26 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); 125 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
@@ -72,17 +171,15 @@ void intel_pmu_lbr_enable(struct perf_event *event)
72 if (!x86_pmu.lbr_nr) 171 if (!x86_pmu.lbr_nr)
73 return; 172 return;
74 173
75 WARN_ON_ONCE(cpuc->enabled);
76
77 /* 174 /*
78 * Reset the LBR stack if we changed task context to 175 * Reset the LBR stack if we changed task context to
79 * avoid data leaks. 176 * avoid data leaks.
80 */ 177 */
81
82 if (event->ctx->task && cpuc->lbr_context != event->ctx) { 178 if (event->ctx->task && cpuc->lbr_context != event->ctx) {
83 intel_pmu_lbr_reset(); 179 intel_pmu_lbr_reset();
84 cpuc->lbr_context = event->ctx; 180 cpuc->lbr_context = event->ctx;
85 } 181 }
182 cpuc->br_sel = event->hw.branch_reg.reg;
86 183
87 cpuc->lbr_users++; 184 cpuc->lbr_users++;
88} 185}
@@ -97,8 +194,11 @@ void intel_pmu_lbr_disable(struct perf_event *event)
97 cpuc->lbr_users--; 194 cpuc->lbr_users--;
98 WARN_ON_ONCE(cpuc->lbr_users < 0); 195 WARN_ON_ONCE(cpuc->lbr_users < 0);
99 196
100 if (cpuc->enabled && !cpuc->lbr_users) 197 if (cpuc->enabled && !cpuc->lbr_users) {
101 __intel_pmu_lbr_disable(); 198 __intel_pmu_lbr_disable();
199 /* avoid stale pointer */
200 cpuc->lbr_context = NULL;
201 }
102} 202}
103 203
104void intel_pmu_lbr_enable_all(void) 204void intel_pmu_lbr_enable_all(void)
@@ -117,6 +217,9 @@ void intel_pmu_lbr_disable_all(void)
117 __intel_pmu_lbr_disable(); 217 __intel_pmu_lbr_disable();
118} 218}
119 219
220/*
221 * TOS = most recently recorded branch
222 */
120static inline u64 intel_pmu_lbr_tos(void) 223static inline u64 intel_pmu_lbr_tos(void)
121{ 224{
122 u64 tos; 225 u64 tos;
@@ -144,15 +247,15 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
144 247
145 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); 248 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
146 249
147 cpuc->lbr_entries[i].from = msr_lastbranch.from; 250 cpuc->lbr_entries[i].from = msr_lastbranch.from;
148 cpuc->lbr_entries[i].to = msr_lastbranch.to; 251 cpuc->lbr_entries[i].to = msr_lastbranch.to;
149 cpuc->lbr_entries[i].flags = 0; 252 cpuc->lbr_entries[i].mispred = 0;
253 cpuc->lbr_entries[i].predicted = 0;
254 cpuc->lbr_entries[i].reserved = 0;
150 } 255 }
151 cpuc->lbr_stack.nr = i; 256 cpuc->lbr_stack.nr = i;
152} 257}
153 258
154#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
155
156/* 259/*
157 * Due to lack of segmentation in Linux the effective address (offset) 260 * Due to lack of segmentation in Linux the effective address (offset)
158 * is the same as the linear address, allowing us to merge the LIP and EIP 261 * is the same as the linear address, allowing us to merge the LIP and EIP
@@ -167,19 +270,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
167 270
168 for (i = 0; i < x86_pmu.lbr_nr; i++) { 271 for (i = 0; i < x86_pmu.lbr_nr; i++) {
169 unsigned long lbr_idx = (tos - i) & mask; 272 unsigned long lbr_idx = (tos - i) & mask;
170 u64 from, to, flags = 0; 273 u64 from, to, mis = 0, pred = 0;
171 274
172 rdmsrl(x86_pmu.lbr_from + lbr_idx, from); 275 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
173 rdmsrl(x86_pmu.lbr_to + lbr_idx, to); 276 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
174 277
175 if (lbr_format == LBR_FORMAT_EIP_FLAGS) { 278 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
176 flags = !!(from & LBR_FROM_FLAG_MISPRED); 279 mis = !!(from & LBR_FROM_FLAG_MISPRED);
280 pred = !mis;
177 from = (u64)((((s64)from) << 1) >> 1); 281 from = (u64)((((s64)from) << 1) >> 1);
178 } 282 }
179 283
180 cpuc->lbr_entries[i].from = from; 284 cpuc->lbr_entries[i].from = from;
181 cpuc->lbr_entries[i].to = to; 285 cpuc->lbr_entries[i].to = to;
182 cpuc->lbr_entries[i].flags = flags; 286 cpuc->lbr_entries[i].mispred = mis;
287 cpuc->lbr_entries[i].predicted = pred;
288 cpuc->lbr_entries[i].reserved = 0;
183 } 289 }
184 cpuc->lbr_stack.nr = i; 290 cpuc->lbr_stack.nr = i;
185} 291}
@@ -195,28 +301,404 @@ void intel_pmu_lbr_read(void)
195 intel_pmu_lbr_read_32(cpuc); 301 intel_pmu_lbr_read_32(cpuc);
196 else 302 else
197 intel_pmu_lbr_read_64(cpuc); 303 intel_pmu_lbr_read_64(cpuc);
304
305 intel_pmu_lbr_filter(cpuc);
306}
307
308/*
309 * SW filter is used:
310 * - in case there is no HW filter
311 * - in case the HW filter has errata or limitations
312 */
313static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
314{
315 u64 br_type = event->attr.branch_sample_type;
316 int mask = 0;
317
318 if (br_type & PERF_SAMPLE_BRANCH_USER)
319 mask |= X86_BR_USER;
320
321 if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
322 mask |= X86_BR_KERNEL;
323
324 /* we ignore BRANCH_HV here */
325
326 if (br_type & PERF_SAMPLE_BRANCH_ANY)
327 mask |= X86_BR_ANY;
328
329 if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
330 mask |= X86_BR_ANY_CALL;
331
332 if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
333 mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
334
335 if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
336 mask |= X86_BR_IND_CALL;
337 /*
 338 * stash the actual user request into reg; it may
 339 * be used by the fixup code on some CPUs
340 */
341 event->hw.branch_reg.reg = mask;
342}
343
344/*
345 * setup the HW LBR filter
 346 * Used only when available; it may not be enough to disambiguate
 347 * all branches and may need the help of the SW filter
348 */
349static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
350{
351 struct hw_perf_event_extra *reg;
352 u64 br_type = event->attr.branch_sample_type;
353 u64 mask = 0, m;
354 u64 v;
355
356 for_each_branch_sample_type(m) {
357 if (!(br_type & m))
358 continue;
359
360 v = x86_pmu.lbr_sel_map[m];
361 if (v == LBR_NOT_SUPP)
362 return -EOPNOTSUPP;
363
364 if (v != LBR_IGN)
365 mask |= v;
366 }
367 reg = &event->hw.branch_reg;
368 reg->idx = EXTRA_REG_LBR;
369
370 /* LBR_SELECT operates in suppress mode so invert mask */
371 reg->config = ~mask & x86_pmu.lbr_sel_mask;
372
373 return 0;
374}
375
376int intel_pmu_setup_lbr_filter(struct perf_event *event)
377{
378 int ret = 0;
379
380 /*
381 * no LBR on this PMU
382 */
383 if (!x86_pmu.lbr_nr)
384 return -EOPNOTSUPP;
385
386 /*
387 * setup SW LBR filter
388 */
389 intel_pmu_setup_sw_lbr_filter(event);
390
391 /*
392 * setup HW LBR filter, if any
393 */
394 if (x86_pmu.lbr_sel_map)
395 ret = intel_pmu_setup_hw_lbr_filter(event);
396
397 return ret;
398}
399
400/*
401 * return the type of control flow change at address "from"
402 * intruction is not necessarily a branch (in case of interrupt).
403 *
404 * The branch type returned also includes the priv level of the
405 * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
406 *
407 * If a branch type is unknown OR the instruction cannot be
408 * decoded (e.g., text page not present), then X86_BR_NONE is
409 * returned.
410 */
411static int branch_type(unsigned long from, unsigned long to)
412{
413 struct insn insn;
414 void *addr;
415 int bytes, size = MAX_INSN_SIZE;
416 int ret = X86_BR_NONE;
417 int ext, to_plm, from_plm;
418 u8 buf[MAX_INSN_SIZE];
419 int is64 = 0;
420
421 to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
422 from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
423
424 /*
 425 * may be zero if the LBR did not fill up after a reset by the time
 426 * we get a PMU interrupt
427 */
428 if (from == 0 || to == 0)
429 return X86_BR_NONE;
430
431 if (from_plm == X86_BR_USER) {
432 /*
433 * can happen if measuring at the user level only
434 * and we interrupt in a kernel thread, e.g., idle.
435 */
436 if (!current->mm)
437 return X86_BR_NONE;
438
439 /* may fail if text not present */
440 bytes = copy_from_user_nmi(buf, (void __user *)from, size);
441 if (bytes != size)
442 return X86_BR_NONE;
443
444 addr = buf;
445 } else
446 addr = (void *)from;
447
448 /*
 449 * the decoder needs to know the ABI, especially
450 * on 64-bit systems running 32-bit apps
451 */
452#ifdef CONFIG_X86_64
453 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
454#endif
455 insn_init(&insn, addr, is64);
456 insn_get_opcode(&insn);
457
458 switch (insn.opcode.bytes[0]) {
459 case 0xf:
460 switch (insn.opcode.bytes[1]) {
461 case 0x05: /* syscall */
462 case 0x34: /* sysenter */
463 ret = X86_BR_SYSCALL;
464 break;
465 case 0x07: /* sysret */
466 case 0x35: /* sysexit */
467 ret = X86_BR_SYSRET;
468 break;
469 case 0x80 ... 0x8f: /* conditional */
470 ret = X86_BR_JCC;
471 break;
472 default:
473 ret = X86_BR_NONE;
474 }
475 break;
476 case 0x70 ... 0x7f: /* conditional */
477 ret = X86_BR_JCC;
478 break;
479 case 0xc2: /* near ret */
480 case 0xc3: /* near ret */
481 case 0xca: /* far ret */
482 case 0xcb: /* far ret */
483 ret = X86_BR_RET;
484 break;
485 case 0xcf: /* iret */
486 ret = X86_BR_IRET;
487 break;
488 case 0xcc ... 0xce: /* int */
489 ret = X86_BR_INT;
490 break;
491 case 0xe8: /* call near rel */
492 case 0x9a: /* call far absolute */
493 ret = X86_BR_CALL;
494 break;
495 case 0xe0 ... 0xe3: /* loop jmp */
496 ret = X86_BR_JCC;
497 break;
498 case 0xe9 ... 0xeb: /* jmp */
499 ret = X86_BR_JMP;
500 break;
501 case 0xff: /* call near absolute, call far absolute ind */
502 insn_get_modrm(&insn);
503 ext = (insn.modrm.bytes[0] >> 3) & 0x7;
504 switch (ext) {
505 case 2: /* near ind call */
506 case 3: /* far ind call */
507 ret = X86_BR_IND_CALL;
508 break;
509 case 4:
510 case 5:
511 ret = X86_BR_JMP;
512 break;
513 }
514 break;
515 default:
516 ret = X86_BR_NONE;
517 }
518 /*
 519 * interrupts, traps, faults (and thus ring transitions) may
 520 * occur on any instruction. Thus, to classify them correctly,
 521 * we need to first look at the from and to priv levels. If they
 522 * are different and to is in the kernel, then it indicates
 523 * a ring transition. If the from instruction is not a ring
 524 * transition instr (syscall, sysenter, int), then it means
 525 * it was an irq, trap or fault.
 526 *
 527 * we have no way of detecting kernel-to-kernel faults.
528 */
529 if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
530 && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
531 ret = X86_BR_IRQ;
532
533 /*
 534 * the branch priv level is determined by the target, as
 535 * is done by the HW when LBR_SELECT is implemented
536 */
537 if (ret != X86_BR_NONE)
538 ret |= to_plm;
539
540 return ret;
541}
542
543/*
 544 * implement the actual branch filter based on the user's request.
 545 * Hardware may not exactly satisfy that request, thus
 546 * we need to inspect opcodes. Mismatched branches are
 547 * discarded. Therefore, the number of branches returned
 548 * in a PERF_SAMPLE_BRANCH_STACK sample may vary.
549 */
550static void
551intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
552{
553 u64 from, to;
554 int br_sel = cpuc->br_sel;
555 int i, j, type;
556 bool compress = false;
557
558 /* if sampling all branches, then nothing to filter */
559 if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
560 return;
561
562 for (i = 0; i < cpuc->lbr_stack.nr; i++) {
563
564 from = cpuc->lbr_entries[i].from;
565 to = cpuc->lbr_entries[i].to;
566
567 type = branch_type(from, to);
568
569 /* if type does not correspond, then discard */
570 if (type == X86_BR_NONE || (br_sel & type) != type) {
571 cpuc->lbr_entries[i].from = 0;
572 compress = true;
573 }
574 }
575
576 if (!compress)
577 return;
578
579 /* remove all entries with from=0 */
580 for (i = 0; i < cpuc->lbr_stack.nr; ) {
581 if (!cpuc->lbr_entries[i].from) {
582 j = i;
583 while (++j < cpuc->lbr_stack.nr)
584 cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
585 cpuc->lbr_stack.nr--;
586 if (!cpuc->lbr_entries[i].from)
587 continue;
588 }
589 i++;
590 }
198} 591}
199 592
593/*
594 * Map interface branch filters onto LBR filters
595 */
596static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
597 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
598 [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
599 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
600 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
601 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
602 | LBR_IND_JMP | LBR_FAR,
603 /*
604 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
605 */
606 [PERF_SAMPLE_BRANCH_ANY_CALL] =
607 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
608 /*
609 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
610 */
611 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
612};
613
614static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
615 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
616 [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
617 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
618 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
619 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
620 [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
621 | LBR_FAR,
622 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
623};
624
625/* core */
200void intel_pmu_lbr_init_core(void) 626void intel_pmu_lbr_init_core(void)
201{ 627{
202 x86_pmu.lbr_nr = 4; 628 x86_pmu.lbr_nr = 4;
203 x86_pmu.lbr_tos = 0x01c9; 629 x86_pmu.lbr_tos = MSR_LBR_TOS;
204 x86_pmu.lbr_from = 0x40; 630 x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
205 x86_pmu.lbr_to = 0x60; 631 x86_pmu.lbr_to = MSR_LBR_CORE_TO;
632
633 /*
634 * SW branch filter usage:
635 * - compensate for lack of HW filter
636 */
637 pr_cont("4-deep LBR, ");
206} 638}
207 639
640/* nehalem/westmere */
208void intel_pmu_lbr_init_nhm(void) 641void intel_pmu_lbr_init_nhm(void)
209{ 642{
210 x86_pmu.lbr_nr = 16; 643 x86_pmu.lbr_nr = 16;
211 x86_pmu.lbr_tos = 0x01c9; 644 x86_pmu.lbr_tos = MSR_LBR_TOS;
212 x86_pmu.lbr_from = 0x680; 645 x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
213 x86_pmu.lbr_to = 0x6c0; 646 x86_pmu.lbr_to = MSR_LBR_NHM_TO;
647
648 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
649 x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
650
651 /*
652 * SW branch filter usage:
 653 * - work around LBR_SEL errata (see above)
 654 * - support syscall, sysret capture.
 655 * That requires LBR_FAR, but that means far
 656 * jmps need to be filtered out
657 */
658 pr_cont("16-deep LBR, ");
659}
660
661/* sandy bridge */
662void intel_pmu_lbr_init_snb(void)
663{
664 x86_pmu.lbr_nr = 16;
665 x86_pmu.lbr_tos = MSR_LBR_TOS;
666 x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
667 x86_pmu.lbr_to = MSR_LBR_NHM_TO;
668
669 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
670 x86_pmu.lbr_sel_map = snb_lbr_sel_map;
671
672 /*
673 * SW branch filter usage:
674 * - support syscall, sysret capture.
 675 * That requires LBR_FAR, but that means far
 676 * jmps need to be filtered out
677 */
678 pr_cont("16-deep LBR, ");
214} 679}
215 680
681/* atom */
216void intel_pmu_lbr_init_atom(void) 682void intel_pmu_lbr_init_atom(void)
217{ 683{
684 /*
 685 * only models starting at stepping 10 seem
 686 * to have an operational LBR which can freeze
 687 * on a PMU interrupt
688 */
689 if (boot_cpu_data.x86_mask < 10) {
690 pr_cont("LBR disabled due to erratum");
691 return;
692 }
693
218 x86_pmu.lbr_nr = 8; 694 x86_pmu.lbr_nr = 8;
219 x86_pmu.lbr_tos = 0x01c9; 695 x86_pmu.lbr_tos = MSR_LBR_TOS;
220 x86_pmu.lbr_from = 0x40; 696 x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
221 x86_pmu.lbr_to = 0x60; 697 x86_pmu.lbr_to = MSR_LBR_CORE_TO;
698
699 /*
700 * SW branch filter usage:
701 * - compensate for lack of HW filter
702 */
703 pr_cont("8-deep LBR, ");
222} 704}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index c7f64e6f537a..addf9e82a7f2 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -40,6 +40,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
40 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, 40 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
41 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, 41 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
42 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, 42 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
43 { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
43 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, 44 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, 45 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, 46 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 642f75a68cd5..11891ca7b716 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -62,16 +62,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
62 62
63 if (!userbuf) { 63 if (!userbuf) {
64 memcpy(buf, (vaddr + offset), csize); 64 memcpy(buf, (vaddr + offset), csize);
65 kunmap_atomic(vaddr, KM_PTE0); 65 kunmap_atomic(vaddr);
66 } else { 66 } else {
67 if (!kdump_buf_page) { 67 if (!kdump_buf_page) {
68 printk(KERN_WARNING "Kdump: Kdump buffer page not" 68 printk(KERN_WARNING "Kdump: Kdump buffer page not"
69 " allocated\n"); 69 " allocated\n");
70 kunmap_atomic(vaddr, KM_PTE0); 70 kunmap_atomic(vaddr);
71 return -EFAULT; 71 return -EFAULT;
72 } 72 }
73 copy_page(kdump_buf_page, vaddr); 73 copy_page(kdump_buf_page, vaddr);
74 kunmap_atomic(vaddr, KM_PTE0); 74 kunmap_atomic(vaddr);
75 if (copy_to_user(buf, (kdump_buf_page + offset), csize)) 75 if (copy_to_user(buf, (kdump_buf_page + offset), csize))
76 return -EFAULT; 76 return -EFAULT;
77 } 77 }
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 52821799a702..3ae2ced4a874 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -4,6 +4,7 @@
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <linux/export.h> 5#include <linux/export.h>
6#include <linux/io.h> 6#include <linux/io.h>
7#include <linux/irqdomain.h>
7#include <linux/interrupt.h> 8#include <linux/interrupt.h>
8#include <linux/list.h> 9#include <linux/list.h>
9#include <linux/of.h> 10#include <linux/of.h>
@@ -17,64 +18,14 @@
17#include <linux/initrd.h> 18#include <linux/initrd.h>
18 19
19#include <asm/hpet.h> 20#include <asm/hpet.h>
20#include <asm/irq_controller.h>
21#include <asm/apic.h> 21#include <asm/apic.h>
22#include <asm/pci_x86.h> 22#include <asm/pci_x86.h>
23 23
24__initdata u64 initial_dtb; 24__initdata u64 initial_dtb;
25char __initdata cmd_line[COMMAND_LINE_SIZE]; 25char __initdata cmd_line[COMMAND_LINE_SIZE];
26static LIST_HEAD(irq_domains);
27static DEFINE_RAW_SPINLOCK(big_irq_lock);
28 26
29int __initdata of_ioapic; 27int __initdata of_ioapic;
30 28
31#ifdef CONFIG_X86_IO_APIC
32static void add_interrupt_host(struct irq_domain *ih)
33{
34 unsigned long flags;
35
36 raw_spin_lock_irqsave(&big_irq_lock, flags);
37 list_add(&ih->l, &irq_domains);
38 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
39}
40#endif
41
42static struct irq_domain *get_ih_from_node(struct device_node *controller)
43{
44 struct irq_domain *ih, *found = NULL;
45 unsigned long flags;
46
47 raw_spin_lock_irqsave(&big_irq_lock, flags);
48 list_for_each_entry(ih, &irq_domains, l) {
49 if (ih->controller == controller) {
50 found = ih;
51 break;
52 }
53 }
54 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
55 return found;
56}
57
58unsigned int irq_create_of_mapping(struct device_node *controller,
59 const u32 *intspec, unsigned int intsize)
60{
61 struct irq_domain *ih;
62 u32 virq, type;
63 int ret;
64
65 ih = get_ih_from_node(controller);
66 if (!ih)
67 return 0;
68 ret = ih->xlate(ih, intspec, intsize, &virq, &type);
69 if (ret)
70 return 0;
71 if (type == IRQ_TYPE_NONE)
72 return virq;
73 irq_set_irq_type(virq, type);
74 return virq;
75}
76EXPORT_SYMBOL_GPL(irq_create_of_mapping);
77
78unsigned long pci_address_to_pio(phys_addr_t address) 29unsigned long pci_address_to_pio(phys_addr_t address)
79{ 30{
80 /* 31 /*
@@ -354,36 +305,43 @@ static struct of_ioapic_type of_ioapic_type[] =
354 }, 305 },
355}; 306};
356 307
357static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, 308static int ioapic_xlate(struct irq_domain *domain,
358 u32 *out_hwirq, u32 *out_type) 309 struct device_node *controller,
310 const u32 *intspec, u32 intsize,
311 irq_hw_number_t *out_hwirq, u32 *out_type)
359{ 312{
360 struct mp_ioapic_gsi *gsi_cfg;
361 struct io_apic_irq_attr attr; 313 struct io_apic_irq_attr attr;
362 struct of_ioapic_type *it; 314 struct of_ioapic_type *it;
363 u32 line, idx, type; 315 u32 line, idx;
316 int rc;
364 317
365 if (intsize < 2) 318 if (WARN_ON(intsize < 2))
366 return -EINVAL; 319 return -EINVAL;
367 320
368 line = *intspec; 321 line = intspec[0];
369 idx = (u32) id->priv;
370 gsi_cfg = mp_ioapic_gsi_routing(idx);
371 *out_hwirq = line + gsi_cfg->gsi_base;
372
373 intspec++;
374 type = *intspec;
375 322
376 if (type >= ARRAY_SIZE(of_ioapic_type)) 323 if (intspec[1] >= ARRAY_SIZE(of_ioapic_type))
377 return -EINVAL; 324 return -EINVAL;
378 325
379 it = of_ioapic_type + type; 326 it = &of_ioapic_type[intspec[1]];
380 *out_type = it->out_type;
381 327
328 idx = (u32) domain->host_data;
382 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); 329 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
383 330
384 return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); 331 rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line),
332 cpu_to_node(0), &attr);
333 if (rc)
334 return rc;
335
336 *out_hwirq = line;
337 *out_type = it->out_type;
338 return 0;
385} 339}
386 340
341const struct irq_domain_ops ioapic_irq_domain_ops = {
342 .xlate = ioapic_xlate,
343};
344
387static void __init ioapic_add_ofnode(struct device_node *np) 345static void __init ioapic_add_ofnode(struct device_node *np)
388{ 346{
389 struct resource r; 347 struct resource r;
@@ -399,13 +357,14 @@ static void __init ioapic_add_ofnode(struct device_node *np)
399 for (i = 0; i < nr_ioapics; i++) { 357 for (i = 0; i < nr_ioapics; i++) {
400 if (r.start == mpc_ioapic_addr(i)) { 358 if (r.start == mpc_ioapic_addr(i)) {
401 struct irq_domain *id; 359 struct irq_domain *id;
360 struct mp_ioapic_gsi *gsi_cfg;
361
362 gsi_cfg = mp_ioapic_gsi_routing(i);
402 363
403 id = kzalloc(sizeof(*id), GFP_KERNEL); 364 id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0,
365 &ioapic_irq_domain_ops,
366 (void*)i);
404 BUG_ON(!id); 367 BUG_ON(!id);
405 id->controller = np;
406 id->xlate = ioapic_xlate;
407 id->priv = (void *)i;
408 add_interrupt_host(id);
409 return; 368 return;
410 } 369 }
411 } 370 }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 1aae78f775fc..4025fe4f928f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -252,7 +252,8 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
252 unsigned short ss; 252 unsigned short ss;
253 unsigned long sp; 253 unsigned long sp;
254#endif 254#endif
255 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); 255 printk(KERN_DEFAULT
256 "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
256#ifdef CONFIG_PREEMPT 257#ifdef CONFIG_PREEMPT
257 printk("PREEMPT "); 258 printk("PREEMPT ");
258#endif 259#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index c99f9ed013d5..88ec9129271d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -87,7 +87,7 @@ void show_registers(struct pt_regs *regs)
87 int i; 87 int i;
88 88
89 print_modules(); 89 print_modules();
90 __show_regs(regs, 0); 90 __show_regs(regs, !user_mode_vm(regs));
91 91
92 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", 92 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
93 TASK_COMM_LEN, current->comm, task_pid_nr(current), 93 TASK_COMM_LEN, current->comm, task_pid_nr(current),
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 6d728d9284bd..17107bd6e1f0 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -129,7 +129,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
129 if (!stack) { 129 if (!stack) {
130 if (regs) 130 if (regs)
131 stack = (unsigned long *)regs->sp; 131 stack = (unsigned long *)regs->sp;
132 else if (task && task != current) 132 else if (task != current)
133 stack = (unsigned long *)task->thread.sp; 133 stack = (unsigned long *)task->thread.sp;
134 else 134 else
135 stack = &dummy; 135 stack = &dummy;
@@ -269,11 +269,11 @@ void show_registers(struct pt_regs *regs)
269 unsigned char c; 269 unsigned char c;
270 u8 *ip; 270 u8 *ip;
271 271
272 printk(KERN_EMERG "Stack:\n"); 272 printk(KERN_DEFAULT "Stack:\n");
273 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 273 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
274 0, KERN_EMERG); 274 0, KERN_DEFAULT);
275 275
276 printk(KERN_EMERG "Code: "); 276 printk(KERN_DEFAULT "Code: ");
277 277
278 ip = (u8 *)regs->ip - code_prologue; 278 ip = (u8 *)regs->ip - code_prologue;
279 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { 279 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 79d97e68f042..7b784f4ef1e4 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -98,12 +98,6 @@
98#endif 98#endif
99.endm 99.endm
100 100
101#ifdef CONFIG_VM86
102#define resume_userspace_sig check_userspace
103#else
104#define resume_userspace_sig resume_userspace
105#endif
106
107/* 101/*
108 * User gs save/restore 102 * User gs save/restore
109 * 103 *
@@ -327,10 +321,19 @@ ret_from_exception:
327 preempt_stop(CLBR_ANY) 321 preempt_stop(CLBR_ANY)
328ret_from_intr: 322ret_from_intr:
329 GET_THREAD_INFO(%ebp) 323 GET_THREAD_INFO(%ebp)
330check_userspace: 324resume_userspace_sig:
325#ifdef CONFIG_VM86
331 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS 326 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
332 movb PT_CS(%esp), %al 327 movb PT_CS(%esp), %al
333 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax 328 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
329#else
330 /*
331 * We can be coming here from a syscall done in the kernel space,
332 * e.g. a failed kernel_execve().
333 */
334 movl PT_CS(%esp), %eax
335 andl $SEGMENT_RPL_MASK, %eax
336#endif
334 cmpl $USER_RPL, %eax 337 cmpl $USER_RPL, %eax
335 jb resume_kernel # not returning to v8086 or userspace 338 jb resume_kernel # not returning to v8086 or userspace
336 339
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3fe8239fd8fb..734ebd1d3caa 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -320,7 +320,7 @@ ENDPROC(native_usergs_sysret64)
320 movq %rsp, %rsi 320 movq %rsp, %rsi
321 321
322 leaq -RBP(%rsp),%rdi /* arg1 for handler */ 322 leaq -RBP(%rsp),%rdi /* arg1 for handler */
323 testl $3, CS(%rdi) 323 testl $3, CS-RBP(%rsi)
324 je 1f 324 je 1f
325 SWAPGS 325 SWAPGS
326 /* 326 /*
@@ -330,11 +330,10 @@ ENDPROC(native_usergs_sysret64)
330 * moving irq_enter into assembly, which would be too much work) 330 * moving irq_enter into assembly, which would be too much work)
331 */ 331 */
3321: incl PER_CPU_VAR(irq_count) 3321: incl PER_CPU_VAR(irq_count)
333 jne 2f 333 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
334 mov PER_CPU_VAR(irq_stack_ptr),%rsp
335 CFI_DEF_CFA_REGISTER rsi 334 CFI_DEF_CFA_REGISTER rsi
336 335
3372: /* Store previous stack value */ 336 /* Store previous stack value */
338 pushq %rsi 337 pushq %rsi
339 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 338 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
340 0x77 /* DW_OP_breg7 */, 0, \ 339 0x77 /* DW_OP_breg7 */, 0, \
@@ -813,7 +812,7 @@ ret_from_intr:
813 812
814 /* Restore saved previous stack */ 813 /* Restore saved previous stack */
815 popq %rsi 814 popq %rsi
816 CFI_DEF_CFA_REGISTER rsi 815 CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
817 leaq ARGOFFSET-RBP(%rsi), %rsp 816 leaq ARGOFFSET-RBP(%rsi), %rsp
818 CFI_DEF_CFA_REGISTER rsp 817 CFI_DEF_CFA_REGISTER rsp
819 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET 818 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
@@ -1530,12 +1529,20 @@ ENTRY(nmi)
1530 1529
1531 /* Use %rdx as out temp variable throughout */ 1530 /* Use %rdx as out temp variable throughout */
1532 pushq_cfi %rdx 1531 pushq_cfi %rdx
1532 CFI_REL_OFFSET rdx, 0
1533
1534 /*
1535 * If %cs was not the kernel segment, then the NMI triggered in user
1536 * space, which means it is definitely not nested.
1537 */
1538 cmpl $__KERNEL_CS, 16(%rsp)
1539 jne first_nmi
1533 1540
1534 /* 1541 /*
1535 * Check the special variable on the stack to see if NMIs are 1542 * Check the special variable on the stack to see if NMIs are
1536 * executing. 1543 * executing.
1537 */ 1544 */
1538 cmp $1, -8(%rsp) 1545 cmpl $1, -8(%rsp)
1539 je nested_nmi 1546 je nested_nmi
1540 1547
1541 /* 1548 /*
@@ -1547,6 +1554,7 @@ ENTRY(nmi)
1547 */ 1554 */
1548 lea 6*8(%rsp), %rdx 1555 lea 6*8(%rsp), %rdx
1549 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi 1556 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
1557 CFI_REMEMBER_STATE
1550 1558
1551nested_nmi: 1559nested_nmi:
1552 /* 1560 /*
@@ -1578,10 +1586,12 @@ nested_nmi:
1578 1586
1579nested_nmi_out: 1587nested_nmi_out:
1580 popq_cfi %rdx 1588 popq_cfi %rdx
1589 CFI_RESTORE rdx
1581 1590
1582 /* No need to check faults here */ 1591 /* No need to check faults here */
1583 INTERRUPT_RETURN 1592 INTERRUPT_RETURN
1584 1593
1594 CFI_RESTORE_STATE
1585first_nmi: 1595first_nmi:
1586 /* 1596 /*
1587 * Because nested NMIs will use the pushed location that we 1597 * Because nested NMIs will use the pushed location that we
@@ -1613,10 +1623,15 @@ first_nmi:
1613 * | pt_regs | 1623 * | pt_regs |
1614 * +-------------------------+ 1624 * +-------------------------+
1615 * 1625 *
1616 * The saved RIP is used to fix up the copied RIP that a nested 1626 * The saved stack frame is used to fix up the copied stack frame
1617 * NMI may zero out. The original stack frame and the temp storage 1627 * that a nested NMI may change to make the interrupted NMI iret jump
1628 * to the repeat_nmi. The original stack frame and the temp storage
1618 * is also used by nested NMIs and can not be trusted on exit. 1629 * is also used by nested NMIs and can not be trusted on exit.
1619 */ 1630 */
1631 /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
1632 movq (%rsp), %rdx
1633 CFI_RESTORE rdx
1634
1620 /* Set the NMI executing variable on the stack. */ 1635 /* Set the NMI executing variable on the stack. */
1621 pushq_cfi $1 1636 pushq_cfi $1
1622 1637
@@ -1624,22 +1639,39 @@ first_nmi:
1624 .rept 5 1639 .rept 5
1625 pushq_cfi 6*8(%rsp) 1640 pushq_cfi 6*8(%rsp)
1626 .endr 1641 .endr
1642 CFI_DEF_CFA_OFFSET SS+8-RIP
1643
1644 /* Everything up to here is safe from nested NMIs */
1645
1646 /*
1647 * If there was a nested NMI, the first NMI's iret will return
1648 * here. But NMIs are still enabled and we can take another
1649 * nested NMI. The nested NMI checks the interrupted RIP to see
1650 * if it is between repeat_nmi and end_repeat_nmi, and if so
1651 * it will just return, as we are about to repeat an NMI anyway.
1652 * This makes it safe to copy to the stack frame that a nested
1653 * NMI will update.
1654 */
1655repeat_nmi:
1656 /*
1657 * Update the stack variable to say we are still in NMI (the update
1658 * is benign for the non-repeat case, where 1 was pushed just above
1659 * to this very stack slot).
1660 */
1661 movq $1, 5*8(%rsp)
1627 1662
1628 /* Make another copy, this one may be modified by nested NMIs */ 1663 /* Make another copy, this one may be modified by nested NMIs */
1629 .rept 5 1664 .rept 5
1630 pushq_cfi 4*8(%rsp) 1665 pushq_cfi 4*8(%rsp)
1631 .endr 1666 .endr
1632 1667 CFI_DEF_CFA_OFFSET SS+8-RIP
1633 /* Do not pop rdx, nested NMIs will corrupt it */ 1668end_repeat_nmi:
1634 movq 11*8(%rsp), %rdx
1635 1669
1636 /* 1670 /*
1637 * Everything below this point can be preempted by a nested 1671 * Everything below this point can be preempted by a nested
1638 * NMI if the first NMI took an exception. Repeated NMIs 1672 * NMI if the first NMI took an exception and reset our iret stack
1639 * caused by an exception and nested NMI will start here, and 1673 * so that we repeat another NMI.
1640 * can still be preempted by another NMI.
1641 */ 1674 */
1642restart_nmi:
1643 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1675 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1644 subq $ORIG_RAX-R15, %rsp 1676 subq $ORIG_RAX-R15, %rsp
1645 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1677 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1668,26 +1700,6 @@ nmi_restore:
1668 CFI_ENDPROC 1700 CFI_ENDPROC
1669END(nmi) 1701END(nmi)
1670 1702
1671 /*
1672 * If an NMI hit an iret because of an exception or breakpoint,
1673 * it can lose its NMI context, and a nested NMI may come in.
1674 * In that case, the nested NMI will change the preempted NMI's
1675 * stack to jump to here when it does the final iret.
1676 */
1677repeat_nmi:
1678 INTR_FRAME
1679 /* Update the stack variable to say we are still in NMI */
1680 movq $1, 5*8(%rsp)
1681
1682 /* copy the saved stack back to copy stack */
1683 .rept 5
1684 pushq_cfi 4*8(%rsp)
1685 .endr
1686
1687 jmp restart_nmi
1688 CFI_ENDPROC
1689end_repeat_nmi:
1690
1691ENTRY(ignore_sysret) 1703ENTRY(ignore_sysret)
1692 CFI_STARTPROC 1704 CFI_STARTPROC
1693 mov $-ENOSYS,%eax 1705 mov $-ENOSYS,%eax
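
Annotation: besides the branchless irq-stack switch (incl followed by cmovzq) and the CFI fixes, the entry_64.S hunk reworks nested-NMI handling: repeat_nmi now sits inline before the register save, and an NMI whose saved CS is not __KERNEL_CS is classified as first-level immediately. A hypothetical C sketch of the check order (DEMO_KERNEL_CS is an assumed selector value, the test_in_nmi range check is elided):

    #include <linux/types.h>

    #define DEMO_KERNEL_CS 0x10    /* assumed __KERNEL_CS selector value */

    enum demo_nmi_path { DEMO_NMI_FIRST, DEMO_NMI_NESTED, DEMO_NMI_CHECK_STACK };

    /*
     * An NMI whose saved CS is not the kernel segment interrupted user space
     * and can never be nested; otherwise the on-stack "NMI executing"
     * variable is consulted; only then does the slower test_in_nmi check run.
     */
    static enum demo_nmi_path demo_classify_nmi(u64 saved_cs, s64 nmi_executing)
    {
        if (saved_cs != DEMO_KERNEL_CS)
            return DEMO_NMI_FIRST;
        if (nmi_executing == 1)
            return DEMO_NMI_NESTED;
        return DEMO_NMI_CHECK_STACK;
    }
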
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 739d8598f789..7734bcbb5a3a 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -16,6 +16,7 @@
16#include <asm/uaccess.h> 16#include <asm/uaccess.h>
17#include <asm/ptrace.h> 17#include <asm/ptrace.h>
18#include <asm/i387.h> 18#include <asm/i387.h>
19#include <asm/fpu-internal.h>
19#include <asm/user.h> 20#include <asm/user.h>
20 21
21#ifdef CONFIG_X86_64 22#ifdef CONFIG_X86_64
@@ -32,6 +33,86 @@
32# define user32_fxsr_struct user_fxsr_struct 33# define user32_fxsr_struct user_fxsr_struct
33#endif 34#endif
34 35
36/*
37 * Were we in an interrupt that interrupted kernel mode?
38 *
39 * We can do a kernel_fpu_begin/end() pair *ONLY* if that
40 * pair does nothing at all: the thread must not have fpu (so
41 * that we don't try to save the FPU state), and TS must
42 * be set (so that the clts/stts pair does nothing that is
43 * visible in the interrupted kernel thread).
44 */
45static inline bool interrupted_kernel_fpu_idle(void)
46{
47 return !__thread_has_fpu(current) &&
48 (read_cr0() & X86_CR0_TS);
49}
50
51/*
52 * Were we in user mode (or vm86 mode) when we were
53 * interrupted?
54 *
55 * Doing kernel_fpu_begin/end() is ok if we are running
56 * in an interrupt context from user mode - we'll just
57 * save the FPU state as required.
58 */
59static inline bool interrupted_user_mode(void)
60{
61 struct pt_regs *regs = get_irq_regs();
62 return regs && user_mode_vm(regs);
63}
64
65/*
66 * Can we use the FPU in kernel mode with the
67 * whole "kernel_fpu_begin/end()" sequence?
68 *
69 * It's always ok in process context (ie "not interrupt")
70 * but it is sometimes ok even from an irq.
71 */
72bool irq_fpu_usable(void)
73{
74 return !in_interrupt() ||
75 interrupted_user_mode() ||
76 interrupted_kernel_fpu_idle();
77}
78EXPORT_SYMBOL(irq_fpu_usable);
79
80void kernel_fpu_begin(void)
81{
82 struct task_struct *me = current;
83
84 WARN_ON_ONCE(!irq_fpu_usable());
85 preempt_disable();
86 if (__thread_has_fpu(me)) {
87 __save_init_fpu(me);
88 __thread_clear_has_fpu(me);
89 /* We do 'stts()' in kernel_fpu_end() */
90 } else {
91 percpu_write(fpu_owner_task, NULL);
92 clts();
93 }
94}
95EXPORT_SYMBOL(kernel_fpu_begin);
96
97void kernel_fpu_end(void)
98{
99 stts();
100 preempt_enable();
101}
102EXPORT_SYMBOL(kernel_fpu_end);
103
104void unlazy_fpu(struct task_struct *tsk)
105{
106 preempt_disable();
107 if (__thread_has_fpu(tsk)) {
108 __save_init_fpu(tsk);
109 __thread_fpu_end(tsk);
110 } else
111 tsk->fpu_counter = 0;
112 preempt_enable();
113}
114EXPORT_SYMBOL(unlazy_fpu);
115
35#ifdef CONFIG_MATH_EMULATION 116#ifdef CONFIG_MATH_EMULATION
36# define HAVE_HWFP (boot_cpu_data.hard_math) 117# define HAVE_HWFP (boot_cpu_data.hard_math)
37#else 118#else
@@ -44,7 +125,7 @@ EXPORT_SYMBOL_GPL(xstate_size);
44unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); 125unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
45static struct i387_fxsave_struct fx_scratch __cpuinitdata; 126static struct i387_fxsave_struct fx_scratch __cpuinitdata;
46 127
47void __cpuinit mxcsr_feature_mask_init(void) 128static void __cpuinit mxcsr_feature_mask_init(void)
48{ 129{
49 unsigned long mask = 0; 130 unsigned long mask = 0;
50 131
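
Annotation: the i387.c hunk moves kernel_fpu_begin()/kernel_fpu_end(), irq_fpu_usable() and unlazy_fpu() out of line. A minimal, hypothetical caller showing the intended usage pattern (demo_copy and the memcpy stand-in for the accelerated path are not from the patch):

    #include <linux/string.h>
    #include <asm/i387.h>   /* kernel_fpu_begin/end(), irq_fpu_usable() */

    static void demo_copy(void *dst, const void *src, size_t len)
    {
        if (irq_fpu_usable()) {
            kernel_fpu_begin();
            memcpy(dst, src, len);  /* stand-in for an SSE-accelerated copy */
            kernel_fpu_end();
        } else {
            memcpy(dst, src, len);  /* scalar fallback */
        }
    }
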
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 40fc86161d92..58b7f27cb3e9 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -100,13 +100,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
100 irqctx->tinfo.task = curctx->tinfo.task; 100 irqctx->tinfo.task = curctx->tinfo.task;
101 irqctx->tinfo.previous_esp = current_stack_pointer; 101 irqctx->tinfo.previous_esp = current_stack_pointer;
102 102
103 /* 103 /* Copy the preempt_count so that the [soft]irq checks work. */
104 * Copy the softirq bits in preempt_count so that the 104 irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
105 * softirq checks work in the hardirq context.
106 */
107 irqctx->tinfo.preempt_count =
108 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
109 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
110 105
111 if (unlikely(overflow)) 106 if (unlikely(overflow))
112 call_on_stack(print_stack_overflow, isp); 107 call_on_stack(print_stack_overflow, isp);
@@ -196,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
196 if (unlikely(!desc)) 191 if (unlikely(!desc))
197 return false; 192 return false;
198 193
199 if (!execute_on_irq_stack(overflow, desc, irq)) { 194 if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
200 if (unlikely(overflow)) 195 if (unlikely(overflow))
201 print_stack_overflow(); 196 print_stack_overflow();
202 desc->handle_irq(irq, desc); 197 desc->handle_irq(irq, desc);
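
Annotation: the irq_32.c hunk copies the whole preempt_count onto the hardirq stack's thread_info (not just the softirq bits) and skips the stack switch entirely when the interrupt arrived from user or vm86 mode. A small model of the old versus new count handoff (mask value and function names are illustrative):

    #define DEMO_SOFTIRQ_MASK 0x0000ff00UL  /* illustrative; real value from preempt count layout */

    /* Old handoff: keep the irq stack's own count, splice in only the softirq bits. */
    static unsigned long demo_old_handoff(unsigned long irqctx_count, unsigned long task_count)
    {
        return (irqctx_count & ~DEMO_SOFTIRQ_MASK) | (task_count & DEMO_SOFTIRQ_MASK);
    }

    /* New handoff: the irq stack's thread_info mirrors the task's whole count,
     * so hardirq and softirq context checks see the same state on either stack. */
    static unsigned long demo_new_handoff(unsigned long task_count)
    {
        return task_count;
    }
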
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 313fb5cddbce..43e2b1cff0a7 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -306,10 +306,10 @@ void __init native_init_IRQ(void)
306 * us. (some of these will be overridden and become 306 * us. (some of these will be overridden and become
307 * 'special' SMP interrupts) 307 * 'special' SMP interrupts)
308 */ 308 */
309 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 309 i = FIRST_EXTERNAL_VECTOR;
310 for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
310 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ 311 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
311 if (!test_bit(i, used_vectors)) 312 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
312 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
313 } 313 }
314 314
315 if (!acpi_ioapic && !of_ioapic) 315 if (!acpi_ioapic && !of_ioapic)
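
Annotation: the irqinit.c hunk replaces the open-coded test_bit() loop with for_each_clear_bit_from(), so only vectors still free in used_vectors get an interrupt gate. A minimal illustration of the iterator (bitmap name, size literal and callback are placeholders):

    #include <linux/types.h>
    #include <linux/bitops.h>

    static DECLARE_BITMAP(demo_used_vectors, 256);  /* NR_VECTORS is 256 on x86 */

    /* Set a gate for every vector whose bit is still clear, starting at 0x20. */
    static void demo_set_free_gates(void (*set_gate)(unsigned int vector))
    {
        unsigned long i = 0x20;  /* FIRST_EXTERNAL_VECTOR */

        for_each_clear_bit_from(i, demo_used_vectors, 256)
            set_gate(i);
    }
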
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index faba5771acad..fdc37b3d0ce3 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -67,8 +67,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
67 { "ss", 4, offsetof(struct pt_regs, ss) }, 67 { "ss", 4, offsetof(struct pt_regs, ss) },
68 { "ds", 4, offsetof(struct pt_regs, ds) }, 68 { "ds", 4, offsetof(struct pt_regs, ds) },
69 { "es", 4, offsetof(struct pt_regs, es) }, 69 { "es", 4, offsetof(struct pt_regs, es) },
70 { "fs", 4, -1 },
71 { "gs", 4, -1 },
72#else 70#else
73 { "ax", 8, offsetof(struct pt_regs, ax) }, 71 { "ax", 8, offsetof(struct pt_regs, ax) },
74 { "bx", 8, offsetof(struct pt_regs, bx) }, 72 { "bx", 8, offsetof(struct pt_regs, bx) },
@@ -90,7 +88,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
90 { "flags", 4, offsetof(struct pt_regs, flags) }, 88 { "flags", 4, offsetof(struct pt_regs, flags) },
91 { "cs", 4, offsetof(struct pt_regs, cs) }, 89 { "cs", 4, offsetof(struct pt_regs, cs) },
92 { "ss", 4, offsetof(struct pt_regs, ss) }, 90 { "ss", 4, offsetof(struct pt_regs, ss) },
91 { "ds", 4, -1 },
92 { "es", 4, -1 },
93#endif 93#endif
94 { "fs", 4, -1 },
95 { "gs", 4, -1 },
94}; 96};
95 97
96int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) 98int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
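
Annotation: the kgdb.c hunk moves fs/gs (and, on 64-bit, ds/es) into dbg_reg_def entries with offset -1, i.e. registers that pt_regs does not store. A simplified, hypothetical reader showing how such entries are typically consumed (the real dbg_get_reg() additionally special-cases some segment registers):

    #include <linux/kgdb.h>
    #include <linux/ptrace.h>
    #include <linux/string.h>

    /* An offset of -1 means "not saved in pt_regs": hand the debugger zeros
     * rather than reading from a bogus slot. */
    static char *demo_get_reg(int regno, void *mem, struct pt_regs *regs)
    {
        if (regno < 0 || regno >= DBG_MAX_REG_NUM)
            return NULL;
        if (dbg_reg_def[regno].offset == -1)
            memset(mem, 0, dbg_reg_def[regno].size);
        else
            memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
                   dbg_reg_def[regno].size);
        return dbg_reg_def[regno].name;
    }
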
diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h
new file mode 100644
index 000000000000..3230b68ef29a
--- /dev/null
+++ b/arch/x86/kernel/kprobes-common.h
@@ -0,0 +1,102 @@
1#ifndef __X86_KERNEL_KPROBES_COMMON_H
2#define __X86_KERNEL_KPROBES_COMMON_H
3
4/* Kprobes and Optprobes common header */
5
6#ifdef CONFIG_X86_64
7#define SAVE_REGS_STRING \
8 /* Skip cs, ip, orig_ax. */ \
9 " subq $24, %rsp\n" \
10 " pushq %rdi\n" \
11 " pushq %rsi\n" \
12 " pushq %rdx\n" \
13 " pushq %rcx\n" \
14 " pushq %rax\n" \
15 " pushq %r8\n" \
16 " pushq %r9\n" \
17 " pushq %r10\n" \
18 " pushq %r11\n" \
19 " pushq %rbx\n" \
20 " pushq %rbp\n" \
21 " pushq %r12\n" \
22 " pushq %r13\n" \
23 " pushq %r14\n" \
24 " pushq %r15\n"
25#define RESTORE_REGS_STRING \
26 " popq %r15\n" \
27 " popq %r14\n" \
28 " popq %r13\n" \
29 " popq %r12\n" \
30 " popq %rbp\n" \
31 " popq %rbx\n" \
32 " popq %r11\n" \
33 " popq %r10\n" \
34 " popq %r9\n" \
35 " popq %r8\n" \
36 " popq %rax\n" \
37 " popq %rcx\n" \
38 " popq %rdx\n" \
39 " popq %rsi\n" \
40 " popq %rdi\n" \
41 /* Skip orig_ax, ip, cs */ \
42 " addq $24, %rsp\n"
43#else
44#define SAVE_REGS_STRING \
45 /* Skip cs, ip, orig_ax and gs. */ \
46 " subl $16, %esp\n" \
47 " pushl %fs\n" \
48 " pushl %es\n" \
49 " pushl %ds\n" \
50 " pushl %eax\n" \
51 " pushl %ebp\n" \
52 " pushl %edi\n" \
53 " pushl %esi\n" \
54 " pushl %edx\n" \
55 " pushl %ecx\n" \
56 " pushl %ebx\n"
57#define RESTORE_REGS_STRING \
58 " popl %ebx\n" \
59 " popl %ecx\n" \
60 " popl %edx\n" \
61 " popl %esi\n" \
62 " popl %edi\n" \
63 " popl %ebp\n" \
64 " popl %eax\n" \
65 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
66 " addl $24, %esp\n"
67#endif
68
 69/* Check whether the instruction can be boosted */
70extern int can_boost(kprobe_opcode_t *instruction);
71/* Recover instruction if given address is probed */
72extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
73 unsigned long addr);
74/*
75 * Copy an instruction and adjust the displacement if the instruction
76 * uses the %rip-relative addressing mode.
77 */
78extern int __copy_instruction(u8 *dest, u8 *src);
79
80/* Generate a relative-jump/call instruction */
81extern void synthesize_reljump(void *from, void *to);
82extern void synthesize_relcall(void *from, void *to);
83
84#ifdef CONFIG_OPTPROBES
85extern int arch_init_optprobes(void);
86extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
87extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
88#else /* !CONFIG_OPTPROBES */
89static inline int arch_init_optprobes(void)
90{
91 return 0;
92}
93static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
94{
95 return 0;
96}
97static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
98{
99 return addr;
100}
101#endif
102#endif
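
Annotation: kprobes-common.h gathers the register save/restore strings and the helpers shared by kprobes.c and kprobes-opt.c, and supplies no-op stubs when CONFIG_OPTPROBES is off so the callers stay unconditional. A hypothetical consumer (demo_singlestep_entry is not from the patch) compiles identically either way:

    #include <linux/kprobes.h>
    #include <linux/ptrace.h>
    #include "kprobes-common.h"

    static void demo_singlestep_entry(struct kprobe *p, struct pt_regs *regs, int reenter)
    {
        if (setup_detour_execution(p, regs, reenter))
            return;  /* jump-optimized path took over */
        /* ... otherwise fall back to int3 single-stepping ... */
    }
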
diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c
new file mode 100644
index 000000000000..c5e410eed403
--- /dev/null
+++ b/arch/x86/kernel/kprobes-opt.c
@@ -0,0 +1,512 @@
1/*
2 * Kernel Probes Jump Optimization (Optprobes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 * Copyright (C) Hitachi Ltd., 2012
20 */
21#include <linux/kprobes.h>
22#include <linux/ptrace.h>
23#include <linux/string.h>
24#include <linux/slab.h>
25#include <linux/hardirq.h>
26#include <linux/preempt.h>
27#include <linux/module.h>
28#include <linux/kdebug.h>
29#include <linux/kallsyms.h>
30#include <linux/ftrace.h>
31
32#include <asm/cacheflush.h>
33#include <asm/desc.h>
34#include <asm/pgtable.h>
35#include <asm/uaccess.h>
36#include <asm/alternative.h>
37#include <asm/insn.h>
38#include <asm/debugreg.h>
39
40#include "kprobes-common.h"
41
42unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
43{
44 struct optimized_kprobe *op;
45 struct kprobe *kp;
46 long offs;
47 int i;
48
49 for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
50 kp = get_kprobe((void *)addr - i);
51 /* This function only handles jump-optimized kprobe */
52 if (kp && kprobe_optimized(kp)) {
53 op = container_of(kp, struct optimized_kprobe, kp);
54 /* If op->list is not empty, op is under optimizing */
55 if (list_empty(&op->list))
56 goto found;
57 }
58 }
59
60 return addr;
61found:
62 /*
 63 * If the kprobe is jump-optimized, the original bytes may have been
 64 * overwritten by the jump destination address. In this case, the original
 65 * bytes must be recovered from the op->optinsn.copied_insn buffer.
66 */
67 memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
68 if (addr == (unsigned long)kp->addr) {
69 buf[0] = kp->opcode;
70 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
71 } else {
72 offs = addr - (unsigned long)kp->addr - 1;
73 memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
74 }
75
76 return (unsigned long)buf;
77}
78
79/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
80static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
81{
82#ifdef CONFIG_X86_64
83 *addr++ = 0x48;
84 *addr++ = 0xbf;
85#else
86 *addr++ = 0xb8;
87#endif
88 *(unsigned long *)addr = val;
89}
90
91static void __used __kprobes kprobes_optinsn_template_holder(void)
92{
93 asm volatile (
94 ".global optprobe_template_entry\n"
95 "optprobe_template_entry:\n"
96#ifdef CONFIG_X86_64
97 /* We don't bother saving the ss register */
98 " pushq %rsp\n"
99 " pushfq\n"
100 SAVE_REGS_STRING
101 " movq %rsp, %rsi\n"
102 ".global optprobe_template_val\n"
103 "optprobe_template_val:\n"
104 ASM_NOP5
105 ASM_NOP5
106 ".global optprobe_template_call\n"
107 "optprobe_template_call:\n"
108 ASM_NOP5
109 /* Move flags to rsp */
110 " movq 144(%rsp), %rdx\n"
111 " movq %rdx, 152(%rsp)\n"
112 RESTORE_REGS_STRING
113 /* Skip flags entry */
114 " addq $8, %rsp\n"
115 " popfq\n"
116#else /* CONFIG_X86_32 */
117 " pushf\n"
118 SAVE_REGS_STRING
119 " movl %esp, %edx\n"
120 ".global optprobe_template_val\n"
121 "optprobe_template_val:\n"
122 ASM_NOP5
123 ".global optprobe_template_call\n"
124 "optprobe_template_call:\n"
125 ASM_NOP5
126 RESTORE_REGS_STRING
127 " addl $4, %esp\n" /* skip cs */
128 " popf\n"
129#endif
130 ".global optprobe_template_end\n"
131 "optprobe_template_end:\n");
132}
133
134#define TMPL_MOVE_IDX \
135 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
136#define TMPL_CALL_IDX \
137 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
138#define TMPL_END_IDX \
139 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
140
141#define INT3_SIZE sizeof(kprobe_opcode_t)
142
 143/* Optimized kprobe callback function: called from optinsn */
144static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
145{
146 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
147 unsigned long flags;
148
149 /* This is possible if op is under delayed unoptimizing */
150 if (kprobe_disabled(&op->kp))
151 return;
152
153 local_irq_save(flags);
154 if (kprobe_running()) {
155 kprobes_inc_nmissed_count(&op->kp);
156 } else {
157 /* Save skipped registers */
158#ifdef CONFIG_X86_64
159 regs->cs = __KERNEL_CS;
160#else
161 regs->cs = __KERNEL_CS | get_kernel_rpl();
162 regs->gs = 0;
163#endif
164 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
165 regs->orig_ax = ~0UL;
166
167 __this_cpu_write(current_kprobe, &op->kp);
168 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
169 opt_pre_handler(&op->kp, regs);
170 __this_cpu_write(current_kprobe, NULL);
171 }
172 local_irq_restore(flags);
173}
174
175static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
176{
177 int len = 0, ret;
178
179 while (len < RELATIVEJUMP_SIZE) {
180 ret = __copy_instruction(dest + len, src + len);
181 if (!ret || !can_boost(dest + len))
182 return -EINVAL;
183 len += ret;
184 }
185 /* Check whether the address range is reserved */
186 if (ftrace_text_reserved(src, src + len - 1) ||
187 alternatives_text_reserved(src, src + len - 1) ||
188 jump_label_text_reserved(src, src + len - 1))
189 return -EBUSY;
190
191 return len;
192}
193
194/* Check whether insn is indirect jump */
195static int __kprobes insn_is_indirect_jump(struct insn *insn)
196{
197 return ((insn->opcode.bytes[0] == 0xff &&
198 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
199 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
200}
201
202/* Check whether insn jumps into specified address range */
203static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
204{
205 unsigned long target = 0;
206
207 switch (insn->opcode.bytes[0]) {
208 case 0xe0: /* loopne */
209 case 0xe1: /* loope */
210 case 0xe2: /* loop */
211 case 0xe3: /* jcxz */
212 case 0xe9: /* near relative jump */
213 case 0xeb: /* short relative jump */
214 break;
215 case 0x0f:
216 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
217 break;
218 return 0;
219 default:
220 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
221 break;
222 return 0;
223 }
224 target = (unsigned long)insn->next_byte + insn->immediate.value;
225
226 return (start <= target && target <= start + len);
227}
228
 229/* Decode the whole function to ensure no instruction jumps into the target */
230static int __kprobes can_optimize(unsigned long paddr)
231{
232 unsigned long addr, size = 0, offset = 0;
233 struct insn insn;
234 kprobe_opcode_t buf[MAX_INSN_SIZE];
235
236 /* Lookup symbol including addr */
237 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
238 return 0;
239
240 /*
241 * Do not optimize in the entry code due to the unstable
242 * stack handling.
243 */
244 if ((paddr >= (unsigned long)__entry_text_start) &&
245 (paddr < (unsigned long)__entry_text_end))
246 return 0;
247
248 /* Check there is enough space for a relative jump. */
249 if (size - offset < RELATIVEJUMP_SIZE)
250 return 0;
251
252 /* Decode instructions */
253 addr = paddr - offset;
254 while (addr < paddr - offset + size) { /* Decode until function end */
255 if (search_exception_tables(addr))
256 /*
 257 * Since some fixup code will jump into this function,
 258 * we can't optimize a kprobe in this function.
259 */
260 return 0;
261 kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr));
262 insn_get_length(&insn);
263 /* Another subsystem puts a breakpoint */
264 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
265 return 0;
266 /* Recover address */
267 insn.kaddr = (void *)addr;
268 insn.next_byte = (void *)(addr + insn.length);
 269 /* Check that no instruction jumps into the target */
270 if (insn_is_indirect_jump(&insn) ||
271 insn_jump_into_range(&insn, paddr + INT3_SIZE,
272 RELATIVE_ADDR_SIZE))
273 return 0;
274 addr += insn.length;
275 }
276
277 return 1;
278}
279
280/* Check optimized_kprobe can actually be optimized. */
281int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
282{
283 int i;
284 struct kprobe *p;
285
286 for (i = 1; i < op->optinsn.size; i++) {
287 p = get_kprobe(op->kp.addr + i);
288 if (p && !kprobe_disabled(p))
289 return -EEXIST;
290 }
291
292 return 0;
293}
294
295/* Check the addr is within the optimized instructions. */
296int __kprobes
297arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr)
298{
299 return ((unsigned long)op->kp.addr <= addr &&
300 (unsigned long)op->kp.addr + op->optinsn.size > addr);
301}
302
303/* Free optimized instruction slot */
304static __kprobes
305void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
306{
307 if (op->optinsn.insn) {
308 free_optinsn_slot(op->optinsn.insn, dirty);
309 op->optinsn.insn = NULL;
310 op->optinsn.size = 0;
311 }
312}
313
314void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
315{
316 __arch_remove_optimized_kprobe(op, 1);
317}
318
319/*
 320 * Copy the instructions that will be replaced by the jump.
 321 * Target instructions MUST be relocatable (checked inside).
 322 * This is called when a new aggr(opt)probe is allocated or reused.
323 */
324int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
325{
326 u8 *buf;
327 int ret;
328 long rel;
329
330 if (!can_optimize((unsigned long)op->kp.addr))
331 return -EILSEQ;
332
333 op->optinsn.insn = get_optinsn_slot();
334 if (!op->optinsn.insn)
335 return -ENOMEM;
336
337 /*
338 * Verify if the address gap is in 2GB range, because this uses
339 * a relative jump.
340 */
341 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
342 if (abs(rel) > 0x7fffffff)
343 return -ERANGE;
344
345 buf = (u8 *)op->optinsn.insn;
346
347 /* Copy instructions into the out-of-line buffer */
348 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
349 if (ret < 0) {
350 __arch_remove_optimized_kprobe(op, 0);
351 return ret;
352 }
353 op->optinsn.size = ret;
354
355 /* Copy arch-dep-instance from template */
356 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
357
358 /* Set probe information */
359 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
360
361 /* Set probe function call */
362 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
363
364 /* Set returning jmp instruction at the tail of out-of-line buffer */
365 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
366 (u8 *)op->kp.addr + op->optinsn.size);
367
368 flush_icache_range((unsigned long) buf,
369 (unsigned long) buf + TMPL_END_IDX +
370 op->optinsn.size + RELATIVEJUMP_SIZE);
371 return 0;
372}
373
374#define MAX_OPTIMIZE_PROBES 256
375static struct text_poke_param *jump_poke_params;
376static struct jump_poke_buffer {
377 u8 buf[RELATIVEJUMP_SIZE];
378} *jump_poke_bufs;
379
380static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
381 u8 *insn_buf,
382 struct optimized_kprobe *op)
383{
384 s32 rel = (s32)((long)op->optinsn.insn -
385 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
386
387 /* Backup instructions which will be replaced by jump address */
388 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
389 RELATIVE_ADDR_SIZE);
390
391 insn_buf[0] = RELATIVEJUMP_OPCODE;
392 *(s32 *)(&insn_buf[1]) = rel;
393
394 tprm->addr = op->kp.addr;
395 tprm->opcode = insn_buf;
396 tprm->len = RELATIVEJUMP_SIZE;
397}
398
399/*
400 * Replace breakpoints (int3) with relative jumps.
401 * Caller must call with locking kprobe_mutex and text_mutex.
402 */
403void __kprobes arch_optimize_kprobes(struct list_head *oplist)
404{
405 struct optimized_kprobe *op, *tmp;
406 int c = 0;
407
408 list_for_each_entry_safe(op, tmp, oplist, list) {
409 WARN_ON(kprobe_disabled(&op->kp));
410 /* Setup param */
411 setup_optimize_kprobe(&jump_poke_params[c],
412 jump_poke_bufs[c].buf, op);
413 list_del_init(&op->list);
414 if (++c >= MAX_OPTIMIZE_PROBES)
415 break;
416 }
417
418 /*
419 * text_poke_smp doesn't support NMI/MCE code modifying.
420 * However, since kprobes itself also doesn't support NMI/MCE
421 * code probing, it's not a problem.
422 */
423 text_poke_smp_batch(jump_poke_params, c);
424}
425
426static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
427 u8 *insn_buf,
428 struct optimized_kprobe *op)
429{
430 /* Set int3 to first byte for kprobes */
431 insn_buf[0] = BREAKPOINT_INSTRUCTION;
432 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
433
434 tprm->addr = op->kp.addr;
435 tprm->opcode = insn_buf;
436 tprm->len = RELATIVEJUMP_SIZE;
437}
438
439/*
440 * Recover original instructions and breakpoints from relative jumps.
441 * Caller must call with locking kprobe_mutex.
442 */
443extern void arch_unoptimize_kprobes(struct list_head *oplist,
444 struct list_head *done_list)
445{
446 struct optimized_kprobe *op, *tmp;
447 int c = 0;
448
449 list_for_each_entry_safe(op, tmp, oplist, list) {
450 /* Setup param */
451 setup_unoptimize_kprobe(&jump_poke_params[c],
452 jump_poke_bufs[c].buf, op);
453 list_move(&op->list, done_list);
454 if (++c >= MAX_OPTIMIZE_PROBES)
455 break;
456 }
457
458 /*
459 * text_poke_smp doesn't support NMI/MCE code modifying.
460 * However, since kprobes itself also doesn't support NMI/MCE
461 * code probing, it's not a problem.
462 */
463 text_poke_smp_batch(jump_poke_params, c);
464}
465
466/* Replace a relative jump with a breakpoint (int3). */
467void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
468{
469 u8 buf[RELATIVEJUMP_SIZE];
470
471 /* Set int3 to first byte for kprobes */
472 buf[0] = BREAKPOINT_INSTRUCTION;
473 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
474 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
475}
476
477int __kprobes
478setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
479{
480 struct optimized_kprobe *op;
481
482 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
483 /* This kprobe is really able to run optimized path. */
484 op = container_of(p, struct optimized_kprobe, kp);
485 /* Detour through copied instructions */
486 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
487 if (!reenter)
488 reset_current_kprobe();
489 preempt_enable_no_resched();
490 return 1;
491 }
492 return 0;
493}
494
495int __kprobes arch_init_optprobes(void)
496{
497 /* Allocate code buffer and parameter array */
498 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
499 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
500 if (!jump_poke_bufs)
501 return -ENOMEM;
502
503 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
504 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
505 if (!jump_poke_params) {
506 kfree(jump_poke_bufs);
507 jump_poke_bufs = NULL;
508 return -ENOMEM;
509 }
510
511 return 0;
512}
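
Annotation: kprobes-opt.c builds, per probe, an out-of-line buffer holding the register-save template, a relocated copy of the replaced instructions, and a jump back, then patches the probe site with a 5-byte relative jump. A stand-alone sketch of that jump encoding (DEMO_* names are local stand-ins for the kernel's RELATIVEJUMP_* constants, the function is hypothetical):

    #include <linux/types.h>
    #include <linux/string.h>

    #define DEMO_RELATIVEJUMP_OPCODE 0xe9
    #define DEMO_RELATIVEJUMP_SIZE   5

    /* A "jmp rel32" at the probe address; the displacement is measured from the
     * end of the jump to the detour buffer, which is why the rel check in
     * arch_prepare_optimized_kprobe() insists the gap fits in a signed 32 bits. */
    static void demo_encode_reljump(u8 insn_buf[DEMO_RELATIVEJUMP_SIZE],
                                    unsigned long from, unsigned long to)
    {
        s32 rel = (s32)(to - (from + DEMO_RELATIVEJUMP_SIZE));

        insn_buf[0] = DEMO_RELATIVEJUMP_OPCODE;
        memcpy(&insn_buf[1], &rel, sizeof(rel));
    }
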
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7da647d8b64c..e213fc8408d2 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -30,16 +30,15 @@
30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi 30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
31 * <prasanna@in.ibm.com> added function-return probes. 31 * <prasanna@in.ibm.com> added function-return probes.
32 * 2005-May Rusty Lynch <rusty.lynch@intel.com> 32 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
33 * Added function return probes functionality 33 * Added function return probes functionality
34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added 34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
35 * kprobe-booster and kretprobe-booster for i386. 35 * kprobe-booster and kretprobe-booster for i386.
36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster 36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
37 * and kretprobe-booster for x86-64 37 * and kretprobe-booster for x86-64
38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven 38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> 39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
40 * unified x86 kprobes code. 40 * unified x86 kprobes code.
41 */ 41 */
42
43#include <linux/kprobes.h> 42#include <linux/kprobes.h>
44#include <linux/ptrace.h> 43#include <linux/ptrace.h>
45#include <linux/string.h> 44#include <linux/string.h>
@@ -59,6 +58,8 @@
59#include <asm/insn.h> 58#include <asm/insn.h>
60#include <asm/debugreg.h> 59#include <asm/debugreg.h>
61 60
61#include "kprobes-common.h"
62
62void jprobe_return_end(void); 63void jprobe_return_end(void);
63 64
64DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 65DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -108,6 +109,7 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
108 doesn't switch kernel stack.*/ 109 doesn't switch kernel stack.*/
109 {NULL, NULL} /* Terminator */ 110 {NULL, NULL} /* Terminator */
110}; 111};
112
111const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 113const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
112 114
113static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) 115static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
@@ -123,11 +125,17 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
123} 125}
124 126
125/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 127/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
126static void __kprobes synthesize_reljump(void *from, void *to) 128void __kprobes synthesize_reljump(void *from, void *to)
127{ 129{
128 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); 130 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
129} 131}
130 132
133/* Insert a call instruction at address 'from', which calls address 'to'.*/
134void __kprobes synthesize_relcall(void *from, void *to)
135{
136 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
137}
138
131/* 139/*
132 * Skip the prefixes of the instruction. 140 * Skip the prefixes of the instruction.
133 */ 141 */
@@ -151,7 +159,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
151 * Returns non-zero if opcode is boostable. 159 * Returns non-zero if opcode is boostable.
152 * RIP relative instructions are adjusted at copying time in 64 bits mode 160 * RIP relative instructions are adjusted at copying time in 64 bits mode
153 */ 161 */
154static int __kprobes can_boost(kprobe_opcode_t *opcodes) 162int __kprobes can_boost(kprobe_opcode_t *opcodes)
155{ 163{
156 kprobe_opcode_t opcode; 164 kprobe_opcode_t opcode;
157 kprobe_opcode_t *orig_opcodes = opcodes; 165 kprobe_opcode_t *orig_opcodes = opcodes;
@@ -207,13 +215,15 @@ retry:
207 } 215 }
208} 216}
209 217
210/* Recover the probed instruction at addr for further analysis. */ 218static unsigned long
211static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) 219__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
212{ 220{
213 struct kprobe *kp; 221 struct kprobe *kp;
222
214 kp = get_kprobe((void *)addr); 223 kp = get_kprobe((void *)addr);
224 /* There is no probe, return original address */
215 if (!kp) 225 if (!kp)
216 return -EINVAL; 226 return addr;
217 227
218 /* 228 /*
219 * Basically, kp->ainsn.insn has an original instruction. 229 * Basically, kp->ainsn.insn has an original instruction.
@@ -230,14 +240,29 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
230 */ 240 */
231 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 241 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
232 buf[0] = kp->opcode; 242 buf[0] = kp->opcode;
233 return 0; 243 return (unsigned long)buf;
244}
245
246/*
247 * Recover the probed instruction at addr for further analysis.
 248 * Caller must lock kprobes by kprobe_mutex, or disable preemption,
 249 * to prevent the referenced kprobes from being released.
250 */
251unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
252{
253 unsigned long __addr;
254
255 __addr = __recover_optprobed_insn(buf, addr);
256 if (__addr != addr)
257 return __addr;
258
259 return __recover_probed_insn(buf, addr);
234} 260}
235 261
236/* Check if paddr is at an instruction boundary */ 262/* Check if paddr is at an instruction boundary */
237static int __kprobes can_probe(unsigned long paddr) 263static int __kprobes can_probe(unsigned long paddr)
238{ 264{
239 int ret; 265 unsigned long addr, __addr, offset = 0;
240 unsigned long addr, offset = 0;
241 struct insn insn; 266 struct insn insn;
242 kprobe_opcode_t buf[MAX_INSN_SIZE]; 267 kprobe_opcode_t buf[MAX_INSN_SIZE];
243 268
@@ -247,26 +272,24 @@ static int __kprobes can_probe(unsigned long paddr)
247 /* Decode instructions */ 272 /* Decode instructions */
248 addr = paddr - offset; 273 addr = paddr - offset;
249 while (addr < paddr) { 274 while (addr < paddr) {
250 kernel_insn_init(&insn, (void *)addr);
251 insn_get_opcode(&insn);
252
253 /* 275 /*
254 * Check if the instruction has been modified by another 276 * Check if the instruction has been modified by another
255 * kprobe, in which case we replace the breakpoint by the 277 * kprobe, in which case we replace the breakpoint by the
256 * original instruction in our buffer. 278 * original instruction in our buffer.
279 * Also, jump optimization will change the breakpoint to
280 * relative-jump. Since the relative-jump itself is
281 * normally used, we just go through if there is no kprobe.
257 */ 282 */
258 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { 283 __addr = recover_probed_instruction(buf, addr);
259 ret = recover_probed_instruction(buf, addr); 284 kernel_insn_init(&insn, (void *)__addr);
260 if (ret)
261 /*
262 * Another debugging subsystem might insert
263 * this breakpoint. In that case, we can't
264 * recover it.
265 */
266 return 0;
267 kernel_insn_init(&insn, buf);
268 }
269 insn_get_length(&insn); 285 insn_get_length(&insn);
286
287 /*
288 * Another debugging subsystem might insert this breakpoint.
289 * In that case, we can't recover it.
290 */
291 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
292 return 0;
270 addr += insn.length; 293 addr += insn.length;
271 } 294 }
272 295
@@ -299,24 +322,16 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
299 * If not, return null. 322 * If not, return null.
300 * Only applicable to 64-bit x86. 323 * Only applicable to 64-bit x86.
301 */ 324 */
302static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) 325int __kprobes __copy_instruction(u8 *dest, u8 *src)
303{ 326{
304 struct insn insn; 327 struct insn insn;
305 int ret;
306 kprobe_opcode_t buf[MAX_INSN_SIZE]; 328 kprobe_opcode_t buf[MAX_INSN_SIZE];
307 329
308 kernel_insn_init(&insn, src); 330 kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src));
309 if (recover) {
310 insn_get_opcode(&insn);
311 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
312 ret = recover_probed_instruction(buf,
313 (unsigned long)src);
314 if (ret)
315 return 0;
316 kernel_insn_init(&insn, buf);
317 }
318 }
319 insn_get_length(&insn); 331 insn_get_length(&insn);
332 /* Another subsystem puts a breakpoint, failed to recover */
333 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
334 return 0;
320 memcpy(dest, insn.kaddr, insn.length); 335 memcpy(dest, insn.kaddr, insn.length);
321 336
322#ifdef CONFIG_X86_64 337#ifdef CONFIG_X86_64
@@ -337,8 +352,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
337 * extension of the original signed 32-bit displacement would 352 * extension of the original signed 32-bit displacement would
338 * have given. 353 * have given.
339 */ 354 */
340 newdisp = (u8 *) src + (s64) insn.displacement.value - 355 newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
341 (u8 *) dest;
342 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ 356 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
343 disp = (u8 *) dest + insn_offset_displacement(&insn); 357 disp = (u8 *) dest + insn_offset_displacement(&insn);
344 *(s32 *) disp = (s32) newdisp; 358 *(s32 *) disp = (s32) newdisp;
@@ -349,18 +363,20 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
349 363
350static void __kprobes arch_copy_kprobe(struct kprobe *p) 364static void __kprobes arch_copy_kprobe(struct kprobe *p)
351{ 365{
366 /* Copy an instruction with recovering if other optprobe modifies it.*/
367 __copy_instruction(p->ainsn.insn, p->addr);
368
352 /* 369 /*
353 * Copy an instruction without recovering int3, because it will be 370 * __copy_instruction can modify the displacement of the instruction,
354 * put by another subsystem. 371 * but it doesn't affect boostable check.
355 */ 372 */
356 __copy_instruction(p->ainsn.insn, p->addr, 0); 373 if (can_boost(p->ainsn.insn))
357
358 if (can_boost(p->addr))
359 p->ainsn.boostable = 0; 374 p->ainsn.boostable = 0;
360 else 375 else
361 p->ainsn.boostable = -1; 376 p->ainsn.boostable = -1;
362 377
363 p->opcode = *p->addr; 378 /* Also, displacement change doesn't affect the first byte */
379 p->opcode = p->ainsn.insn[0];
364} 380}
365 381
366int __kprobes arch_prepare_kprobe(struct kprobe *p) 382int __kprobes arch_prepare_kprobe(struct kprobe *p)
@@ -442,8 +458,8 @@ static void __kprobes restore_btf(void)
442 } 458 }
443} 459}
444 460
445void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 461void __kprobes
446 struct pt_regs *regs) 462arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
447{ 463{
448 unsigned long *sara = stack_addr(regs); 464 unsigned long *sara = stack_addr(regs);
449 465
@@ -453,16 +469,8 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
453 *sara = (unsigned long) &kretprobe_trampoline; 469 *sara = (unsigned long) &kretprobe_trampoline;
454} 470}
455 471
456#ifdef CONFIG_OPTPROBES 472static void __kprobes
457static int __kprobes setup_detour_execution(struct kprobe *p, 473setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
458 struct pt_regs *regs,
459 int reenter);
460#else
461#define setup_detour_execution(p, regs, reenter) (0)
462#endif
463
464static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
465 struct kprobe_ctlblk *kcb, int reenter)
466{ 474{
467 if (setup_detour_execution(p, regs, reenter)) 475 if (setup_detour_execution(p, regs, reenter))
468 return; 476 return;
@@ -504,8 +512,8 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
504 * within the handler. We save the original kprobes variables and just single 512 * within the handler. We save the original kprobes variables and just single
505 * step on the instruction of the new probe without calling any user handlers. 513 * step on the instruction of the new probe without calling any user handlers.
506 */ 514 */
507static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, 515static int __kprobes
508 struct kprobe_ctlblk *kcb) 516reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
509{ 517{
510 switch (kcb->kprobe_status) { 518 switch (kcb->kprobe_status) {
511 case KPROBE_HIT_SSDONE: 519 case KPROBE_HIT_SSDONE:
@@ -600,69 +608,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
600 return 0; 608 return 0;
601} 609}
602 610
603#ifdef CONFIG_X86_64
604#define SAVE_REGS_STRING \
605 /* Skip cs, ip, orig_ax. */ \
606 " subq $24, %rsp\n" \
607 " pushq %rdi\n" \
608 " pushq %rsi\n" \
609 " pushq %rdx\n" \
610 " pushq %rcx\n" \
611 " pushq %rax\n" \
612 " pushq %r8\n" \
613 " pushq %r9\n" \
614 " pushq %r10\n" \
615 " pushq %r11\n" \
616 " pushq %rbx\n" \
617 " pushq %rbp\n" \
618 " pushq %r12\n" \
619 " pushq %r13\n" \
620 " pushq %r14\n" \
621 " pushq %r15\n"
622#define RESTORE_REGS_STRING \
623 " popq %r15\n" \
624 " popq %r14\n" \
625 " popq %r13\n" \
626 " popq %r12\n" \
627 " popq %rbp\n" \
628 " popq %rbx\n" \
629 " popq %r11\n" \
630 " popq %r10\n" \
631 " popq %r9\n" \
632 " popq %r8\n" \
633 " popq %rax\n" \
634 " popq %rcx\n" \
635 " popq %rdx\n" \
636 " popq %rsi\n" \
637 " popq %rdi\n" \
638 /* Skip orig_ax, ip, cs */ \
639 " addq $24, %rsp\n"
640#else
641#define SAVE_REGS_STRING \
642 /* Skip cs, ip, orig_ax and gs. */ \
643 " subl $16, %esp\n" \
644 " pushl %fs\n" \
645 " pushl %es\n" \
646 " pushl %ds\n" \
647 " pushl %eax\n" \
648 " pushl %ebp\n" \
649 " pushl %edi\n" \
650 " pushl %esi\n" \
651 " pushl %edx\n" \
652 " pushl %ecx\n" \
653 " pushl %ebx\n"
654#define RESTORE_REGS_STRING \
655 " popl %ebx\n" \
656 " popl %ecx\n" \
657 " popl %edx\n" \
658 " popl %esi\n" \
659 " popl %edi\n" \
660 " popl %ebp\n" \
661 " popl %eax\n" \
662 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
663 " addl $24, %esp\n"
664#endif
665
666/* 611/*
667 * When a retprobed function returns, this code saves registers and 612 * When a retprobed function returns, this code saves registers and
668 * calls trampoline_handler() runs, which calls the kretprobe's handler. 613 * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -816,8 +761,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
816 * jump instruction after the copied instruction, that jumps to the next 761 * jump instruction after the copied instruction, that jumps to the next
817 * instruction after the probepoint. 762 * instruction after the probepoint.
818 */ 763 */
819static void __kprobes resume_execution(struct kprobe *p, 764static void __kprobes
820 struct pt_regs *regs, struct kprobe_ctlblk *kcb) 765resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
821{ 766{
822 unsigned long *tos = stack_addr(regs); 767 unsigned long *tos = stack_addr(regs);
823 unsigned long copy_ip = (unsigned long)p->ainsn.insn; 768 unsigned long copy_ip = (unsigned long)p->ainsn.insn;
@@ -996,8 +941,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
996/* 941/*
997 * Wrapper routine for handling exceptions. 942 * Wrapper routine for handling exceptions.
998 */ 943 */
999int __kprobes kprobe_exceptions_notify(struct notifier_block *self, 944int __kprobes
1000 unsigned long val, void *data) 945kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data)
1001{ 946{
1002 struct die_args *args = data; 947 struct die_args *args = data;
1003 int ret = NOTIFY_DONE; 948 int ret = NOTIFY_DONE;
@@ -1107,466 +1052,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1107 return 0; 1052 return 0;
1108} 1053}
1109 1054
1110
1111#ifdef CONFIG_OPTPROBES
1112
1113/* Insert a call instruction at address 'from', which calls address 'to'.*/
1114static void __kprobes synthesize_relcall(void *from, void *to)
1115{
1116 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1117}
1118
1119/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1120static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1121 unsigned long val)
1122{
1123#ifdef CONFIG_X86_64
1124 *addr++ = 0x48;
1125 *addr++ = 0xbf;
1126#else
1127 *addr++ = 0xb8;
1128#endif
1129 *(unsigned long *)addr = val;
1130}
1131
1132static void __used __kprobes kprobes_optinsn_template_holder(void)
1133{
1134 asm volatile (
1135 ".global optprobe_template_entry\n"
1136 "optprobe_template_entry: \n"
1137#ifdef CONFIG_X86_64
1138 /* We don't bother saving the ss register */
1139 " pushq %rsp\n"
1140 " pushfq\n"
1141 SAVE_REGS_STRING
1142 " movq %rsp, %rsi\n"
1143 ".global optprobe_template_val\n"
1144 "optprobe_template_val: \n"
1145 ASM_NOP5
1146 ASM_NOP5
1147 ".global optprobe_template_call\n"
1148 "optprobe_template_call: \n"
1149 ASM_NOP5
1150 /* Move flags to rsp */
1151 " movq 144(%rsp), %rdx\n"
1152 " movq %rdx, 152(%rsp)\n"
1153 RESTORE_REGS_STRING
1154 /* Skip flags entry */
1155 " addq $8, %rsp\n"
1156 " popfq\n"
1157#else /* CONFIG_X86_32 */
1158 " pushf\n"
1159 SAVE_REGS_STRING
1160 " movl %esp, %edx\n"
1161 ".global optprobe_template_val\n"
1162 "optprobe_template_val: \n"
1163 ASM_NOP5
1164 ".global optprobe_template_call\n"
1165 "optprobe_template_call: \n"
1166 ASM_NOP5
1167 RESTORE_REGS_STRING
1168 " addl $4, %esp\n" /* skip cs */
1169 " popf\n"
1170#endif
1171 ".global optprobe_template_end\n"
1172 "optprobe_template_end: \n");
1173}
1174
1175#define TMPL_MOVE_IDX \
1176 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1177#define TMPL_CALL_IDX \
1178 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1179#define TMPL_END_IDX \
1180 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1181
1182#define INT3_SIZE sizeof(kprobe_opcode_t)
1183
1184/* Optimized kprobe call back function: called from optinsn */
1185static void __kprobes optimized_callback(struct optimized_kprobe *op,
1186 struct pt_regs *regs)
1187{
1188 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1189 unsigned long flags;
1190
1191 /* This is possible if op is under delayed unoptimizing */
1192 if (kprobe_disabled(&op->kp))
1193 return;
1194
1195 local_irq_save(flags);
1196 if (kprobe_running()) {
1197 kprobes_inc_nmissed_count(&op->kp);
1198 } else {
1199 /* Save skipped registers */
1200#ifdef CONFIG_X86_64
1201 regs->cs = __KERNEL_CS;
1202#else
1203 regs->cs = __KERNEL_CS | get_kernel_rpl();
1204 regs->gs = 0;
1205#endif
1206 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1207 regs->orig_ax = ~0UL;
1208
1209 __this_cpu_write(current_kprobe, &op->kp);
1210 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1211 opt_pre_handler(&op->kp, regs);
1212 __this_cpu_write(current_kprobe, NULL);
1213 }
1214 local_irq_restore(flags);
1215}
1216
1217static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1218{
1219 int len = 0, ret;
1220
1221 while (len < RELATIVEJUMP_SIZE) {
1222 ret = __copy_instruction(dest + len, src + len, 1);
1223 if (!ret || !can_boost(dest + len))
1224 return -EINVAL;
1225 len += ret;
1226 }
1227 /* Check whether the address range is reserved */
1228 if (ftrace_text_reserved(src, src + len - 1) ||
1229 alternatives_text_reserved(src, src + len - 1) ||
1230 jump_label_text_reserved(src, src + len - 1))
1231 return -EBUSY;
1232
1233 return len;
1234}
1235
1236/* Check whether insn is indirect jump */
1237static int __kprobes insn_is_indirect_jump(struct insn *insn)
1238{
1239 return ((insn->opcode.bytes[0] == 0xff &&
1240 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1241 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1242}
1243
1244/* Check whether insn jumps into specified address range */
1245static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1246{
1247 unsigned long target = 0;
1248
1249 switch (insn->opcode.bytes[0]) {
1250 case 0xe0: /* loopne */
1251 case 0xe1: /* loope */
1252 case 0xe2: /* loop */
1253 case 0xe3: /* jcxz */
1254 case 0xe9: /* near relative jump */
1255 case 0xeb: /* short relative jump */
1256 break;
1257 case 0x0f:
1258 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1259 break;
1260 return 0;
1261 default:
1262 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1263 break;
1264 return 0;
1265 }
1266 target = (unsigned long)insn->next_byte + insn->immediate.value;
1267
1268 return (start <= target && target <= start + len);
1269}
1270
1271/* Decode whole function to ensure any instructions don't jump into target */
1272static int __kprobes can_optimize(unsigned long paddr)
1273{
1274 int ret;
1275 unsigned long addr, size = 0, offset = 0;
1276 struct insn insn;
1277 kprobe_opcode_t buf[MAX_INSN_SIZE];
1278
1279 /* Lookup symbol including addr */
1280 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1281 return 0;
1282
1283 /*
1284 * Do not optimize in the entry code due to the unstable
1285 * stack handling.
1286 */
1287 if ((paddr >= (unsigned long )__entry_text_start) &&
1288 (paddr < (unsigned long )__entry_text_end))
1289 return 0;
1290
1291 /* Check there is enough space for a relative jump. */
1292 if (size - offset < RELATIVEJUMP_SIZE)
1293 return 0;
1294
1295 /* Decode instructions */
1296 addr = paddr - offset;
1297 while (addr < paddr - offset + size) { /* Decode until function end */
1298 if (search_exception_tables(addr))
1299 /*
1300 * Since some fixup code will jumps into this function,
1301 * we can't optimize kprobe in this function.
1302 */
1303 return 0;
1304 kernel_insn_init(&insn, (void *)addr);
1305 insn_get_opcode(&insn);
1306 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1307 ret = recover_probed_instruction(buf, addr);
1308 if (ret)
1309 return 0;
1310 kernel_insn_init(&insn, buf);
1311 }
1312 insn_get_length(&insn);
1313 /* Recover address */
1314 insn.kaddr = (void *)addr;
1315 insn.next_byte = (void *)(addr + insn.length);
1316 /* Check any instructions don't jump into target */
1317 if (insn_is_indirect_jump(&insn) ||
1318 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1319 RELATIVE_ADDR_SIZE))
1320 return 0;
1321 addr += insn.length;
1322 }
1323
1324 return 1;
1325}
1326
1327/* Check optimized_kprobe can actually be optimized. */
1328int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1329{
1330 int i;
1331 struct kprobe *p;
1332
1333 for (i = 1; i < op->optinsn.size; i++) {
1334 p = get_kprobe(op->kp.addr + i);
1335 if (p && !kprobe_disabled(p))
1336 return -EEXIST;
1337 }
1338
1339 return 0;
1340}
1341
1342/* Check the addr is within the optimized instructions. */
1343int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1344 unsigned long addr)
1345{
1346 return ((unsigned long)op->kp.addr <= addr &&
1347 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1348}
1349
1350/* Free optimized instruction slot */
1351static __kprobes
1352void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1353{
1354 if (op->optinsn.insn) {
1355 free_optinsn_slot(op->optinsn.insn, dirty);
1356 op->optinsn.insn = NULL;
1357 op->optinsn.size = 0;
1358 }
1359}
1360
1361void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1362{
1363 __arch_remove_optimized_kprobe(op, 1);
1364}
1365
1366/*
1367 * Copy replacing target instructions
1368 * Target instructions MUST be relocatable (checked inside)
1369 */
1370int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1371{
1372 u8 *buf;
1373 int ret;
1374 long rel;
1375
1376 if (!can_optimize((unsigned long)op->kp.addr))
1377 return -EILSEQ;
1378
1379 op->optinsn.insn = get_optinsn_slot();
1380 if (!op->optinsn.insn)
1381 return -ENOMEM;
1382
1383 /*
1384 * Verify if the address gap is in 2GB range, because this uses
1385 * a relative jump.
1386 */
1387 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1388 if (abs(rel) > 0x7fffffff)
1389 return -ERANGE;
1390
1391 buf = (u8 *)op->optinsn.insn;
1392
1393 /* Copy instructions into the out-of-line buffer */
1394 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1395 if (ret < 0) {
1396 __arch_remove_optimized_kprobe(op, 0);
1397 return ret;
1398 }
1399 op->optinsn.size = ret;
1400
1401 /* Copy arch-dep-instance from template */
1402 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1403
1404 /* Set probe information */
1405 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1406
1407 /* Set probe function call */
1408 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1409
1410 /* Set returning jmp instruction at the tail of out-of-line buffer */
1411 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1412 (u8 *)op->kp.addr + op->optinsn.size);
1413
1414 flush_icache_range((unsigned long) buf,
1415 (unsigned long) buf + TMPL_END_IDX +
1416 op->optinsn.size + RELATIVEJUMP_SIZE);
1417 return 0;
1418}
1419
1420#define MAX_OPTIMIZE_PROBES 256
1421static struct text_poke_param *jump_poke_params;
1422static struct jump_poke_buffer {
1423 u8 buf[RELATIVEJUMP_SIZE];
1424} *jump_poke_bufs;
1425
1426static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
1427 u8 *insn_buf,
1428 struct optimized_kprobe *op)
1429{
1430 s32 rel = (s32)((long)op->optinsn.insn -
1431 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1432
1433 /* Backup instructions which will be replaced by jump address */
1434 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1435 RELATIVE_ADDR_SIZE);
1436
1437 insn_buf[0] = RELATIVEJUMP_OPCODE;
1438 *(s32 *)(&insn_buf[1]) = rel;
1439
1440 tprm->addr = op->kp.addr;
1441 tprm->opcode = insn_buf;
1442 tprm->len = RELATIVEJUMP_SIZE;
1443}
1444
1445/*
1446 * Replace breakpoints (int3) with relative jumps.
1447 * Caller must call with locking kprobe_mutex and text_mutex.
1448 */
1449void __kprobes arch_optimize_kprobes(struct list_head *oplist)
1450{
1451 struct optimized_kprobe *op, *tmp;
1452 int c = 0;
1453
1454 list_for_each_entry_safe(op, tmp, oplist, list) {
1455 WARN_ON(kprobe_disabled(&op->kp));
1456 /* Setup param */
1457 setup_optimize_kprobe(&jump_poke_params[c],
1458 jump_poke_bufs[c].buf, op);
1459 list_del_init(&op->list);
1460 if (++c >= MAX_OPTIMIZE_PROBES)
1461 break;
1462 }
1463
1464 /*
1465 * text_poke_smp doesn't support NMI/MCE code modifying.
1466 * However, since kprobes itself also doesn't support NMI/MCE
1467 * code probing, it's not a problem.
1468 */
1469 text_poke_smp_batch(jump_poke_params, c);
1470}
1471
1472static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
1473 u8 *insn_buf,
1474 struct optimized_kprobe *op)
1475{
1476 /* Set int3 to first byte for kprobes */
1477 insn_buf[0] = BREAKPOINT_INSTRUCTION;
1478 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1479
1480 tprm->addr = op->kp.addr;
1481 tprm->opcode = insn_buf;
1482 tprm->len = RELATIVEJUMP_SIZE;
1483}
1484
1485/*
1486 * Recover original instructions and breakpoints from relative jumps.
1487 * Must be called with kprobe_mutex held.
1488 */
1489extern void arch_unoptimize_kprobes(struct list_head *oplist,
1490 struct list_head *done_list)
1491{
1492 struct optimized_kprobe *op, *tmp;
1493 int c = 0;
1494
1495 list_for_each_entry_safe(op, tmp, oplist, list) {
1496 /* Setup param */
1497 setup_unoptimize_kprobe(&jump_poke_params[c],
1498 jump_poke_bufs[c].buf, op);
1499 list_move(&op->list, done_list);
1500 if (++c >= MAX_OPTIMIZE_PROBES)
1501 break;
1502 }
1503
1504 /*
1505 * text_poke_smp doesn't support modifying code that can run in NMI/MCE
1506 * context. However, since kprobes itself doesn't support probing NMI/MCE
1507 * code either, this is not a problem.
1508 */
1509 text_poke_smp_batch(jump_poke_params, c);
1510}
1511
1512/* Replace a relative jump with a breakpoint (int3). */
1513void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1514{
1515 u8 buf[RELATIVEJUMP_SIZE];
1516
1517 /* Set int3 to first byte for kprobes */
1518 buf[0] = BREAKPOINT_INSTRUCTION;
1519 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1520 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1521}
1522
1523static int __kprobes setup_detour_execution(struct kprobe *p,
1524 struct pt_regs *regs,
1525 int reenter)
1526{
1527 struct optimized_kprobe *op;
1528
1529 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1530 /* This kprobe can really run the optimized path. */
1531 op = container_of(p, struct optimized_kprobe, kp);
1532 /* Detour through copied instructions */
1533 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1534 if (!reenter)
1535 reset_current_kprobe();
1536 preempt_enable_no_resched();
1537 return 1;
1538 }
1539 return 0;
1540}
1541
1542static int __kprobes init_poke_params(void)
1543{
1544 /* Allocate code buffer and parameter array */
1545 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
1546 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1547 if (!jump_poke_bufs)
1548 return -ENOMEM;
1549
1550 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
1551 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1552 if (!jump_poke_params) {
1553 kfree(jump_poke_bufs);
1554 jump_poke_bufs = NULL;
1555 return -ENOMEM;
1556 }
1557
1558 return 0;
1559}
1560#else /* !CONFIG_OPTPROBES */
1561static int __kprobes init_poke_params(void)
1562{
1563 return 0;
1564}
1565#endif
1566
1567int __init arch_init_kprobes(void) 1055int __init arch_init_kprobes(void)
1568{ 1056{
1569 return init_poke_params(); 1057 return arch_init_optprobes();
1570} 1058}
1571 1059
1572int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1060int __kprobes arch_trampoline_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f0c6fd6f176b..694d801bf606 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -438,9 +438,9 @@ void __init kvm_guest_init(void)
438static __init int activate_jump_labels(void) 438static __init int activate_jump_labels(void)
439{ 439{
440 if (has_steal_clock) { 440 if (has_steal_clock) {
441 jump_label_inc(&paravirt_steal_enabled); 441 static_key_slow_inc(&paravirt_steal_enabled);
442 if (steal_acc) 442 if (steal_acc)
443 jump_label_inc(&paravirt_steal_rq_enabled); 443 static_key_slow_inc(&paravirt_steal_rq_enabled);
444 } 444 }
445 445
446 return 0; 446 return 0;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 44842d756b29..f8492da65bfc 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -136,6 +136,15 @@ int kvm_register_clock(char *txt)
136 return ret; 136 return ret;
137} 137}
138 138
139static void kvm_save_sched_clock_state(void)
140{
141}
142
143static void kvm_restore_sched_clock_state(void)
144{
145 kvm_register_clock("primary cpu clock, resume");
146}
147
139#ifdef CONFIG_X86_LOCAL_APIC 148#ifdef CONFIG_X86_LOCAL_APIC
140static void __cpuinit kvm_setup_secondary_clock(void) 149static void __cpuinit kvm_setup_secondary_clock(void)
141{ 150{
@@ -144,8 +153,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
144 * we shouldn't fail. 153 * we shouldn't fail.
145 */ 154 */
146 WARN_ON(kvm_register_clock("secondary cpu clock")); 155 WARN_ON(kvm_register_clock("secondary cpu clock"));
147 /* ok, done with our trickery, call native */
148 setup_secondary_APIC_clock();
149} 156}
150#endif 157#endif
151 158
@@ -194,9 +201,11 @@ void __init kvmclock_init(void)
194 x86_platform.get_wallclock = kvm_get_wallclock; 201 x86_platform.get_wallclock = kvm_get_wallclock;
195 x86_platform.set_wallclock = kvm_set_wallclock; 202 x86_platform.set_wallclock = kvm_set_wallclock;
196#ifdef CONFIG_X86_LOCAL_APIC 203#ifdef CONFIG_X86_LOCAL_APIC
197 x86_cpuinit.setup_percpu_clockev = 204 x86_cpuinit.early_percpu_clock_init =
198 kvm_setup_secondary_clock; 205 kvm_setup_secondary_clock;
199#endif 206#endif
207 x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
208 x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
200 machine_ops.shutdown = kvm_shutdown; 209 machine_ops.shutdown = kvm_shutdown;
201#ifdef CONFIG_KEXEC 210#ifdef CONFIG_KEXEC
202 machine_ops.crash_shutdown = kvm_crash_shutdown; 211 machine_ops.crash_shutdown = kvm_crash_shutdown;
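
The kvmclock hunk registers save/restore hooks in x86_platform so that, after resume, the guest re-registers its clock page with the host instead of falling back to the default TSC path. A minimal sketch of that function-pointer override pattern; platform_ops, default_save and the other names here are invented for illustration, not kernel symbols.

#include <stdio.h>

/* A tiny stand-in for the x86_platform ops table. */
struct platform_ops {
	void (*save_sched_clock_state)(void);
	void (*restore_sched_clock_state)(void);
};

static void default_save(void)    { printf("default: save TSC offset\n"); }
static void default_restore(void) { printf("default: restore TSC offset\n"); }

static struct platform_ops platform = {
	.save_sched_clock_state    = default_save,
	.restore_sched_clock_state = default_restore,
};

/* A paravirt clock driver overrides the hooks at init time. */
static void kvm_save(void)    { /* nothing to save: the host keeps the clock */ }
static void kvm_restore(void) { printf("kvm: re-register clock page with host\n"); }

static void kvmclock_init(void)
{
	platform.save_sched_clock_state    = kvm_save;
	platform.restore_sched_clock_state = kvm_restore;
}

int main(void)
{
	kvmclock_init();
	platform.save_sched_clock_state();     /* suspend path */
	platform.restore_sched_clock_state();  /* resume path  */
	return 0;
}
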
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index fe86493f3ed1..73465aab28f8 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -311,13 +311,33 @@ out:
311 return state; 311 return state;
312} 312}
313 313
314/*
315 * AMD microcode firmware naming convention: families before 15h ship
316 * their microcode in the legacy file:
317 *
318 * amd-ucode/microcode_amd.bin
319 *
320 * This legacy file is always smaller than 2K in size.
321 *
322 * Starting at family 15h they are in family specific firmware files:
323 *
324 * amd-ucode/microcode_amd_fam15h.bin
325 * amd-ucode/microcode_amd_fam16h.bin
326 * ...
327 *
328 * These might be larger than 2K.
329 */
314static enum ucode_state request_microcode_amd(int cpu, struct device *device) 330static enum ucode_state request_microcode_amd(int cpu, struct device *device)
315{ 331{
316 const char *fw_name = "amd-ucode/microcode_amd.bin"; 332 char fw_name[36] = "amd-ucode/microcode_amd.bin";
317 const struct firmware *fw; 333 const struct firmware *fw;
318 enum ucode_state ret = UCODE_NFOUND; 334 enum ucode_state ret = UCODE_NFOUND;
335 struct cpuinfo_x86 *c = &cpu_data(cpu);
336
337 if (c->x86 >= 0x15)
338 snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
319 339
320 if (request_firmware(&fw, fw_name, device)) { 340 if (request_firmware(&fw, (const char *)fw_name, device)) {
321 pr_err("failed to load file %s\n", fw_name); 341 pr_err("failed to load file %s\n", fw_name);
322 goto out; 342 goto out;
323 } 343 }
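
The request_microcode_amd() change switches fw_name from a fixed string to a 36-byte buffer so that family 15h and later CPUs request a per-family firmware file. A small sketch of the same name construction; the family values passed in main() are hypothetical, and the real code takes the family from cpu_data():

#include <stdio.h>

/* Build the AMD microcode firmware path for a given CPU family. */
static void microcode_fw_name(char *buf, size_t len, unsigned int family)
{
	if (family >= 0x15)
		snprintf(buf, len, "amd-ucode/microcode_amd_fam%.2xh.bin", family);
	else
		snprintf(buf, len, "amd-ucode/microcode_amd.bin");
}

int main(void)
{
	char name[36];   /* longest name is exactly 35 chars plus the NUL */

	microcode_fw_name(name, sizeof(name), 0x10);
	printf("%s\n", name);   /* amd-ucode/microcode_amd.bin */

	microcode_fw_name(name, sizeof(name), 0x15);
	printf("%s\n", name);   /* amd-ucode/microcode_amd_fam15h.bin */
	return 0;
}
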
@@ -340,7 +360,6 @@ out:
340static enum ucode_state 360static enum ucode_state
341request_microcode_user(int cpu, const void __user *buf, size_t size) 361request_microcode_user(int cpu, const void __user *buf, size_t size)
342{ 362{
343 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
344 return UCODE_ERROR; 363 return UCODE_ERROR;
345} 364}
346 365
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fda91c307104..87a0f8688301 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -86,6 +86,7 @@
86 86
87#include <asm/microcode.h> 87#include <asm/microcode.h>
88#include <asm/processor.h> 88#include <asm/processor.h>
89#include <asm/cpu_device_id.h>
89 90
90MODULE_DESCRIPTION("Microcode Update Driver"); 91MODULE_DESCRIPTION("Microcode Update Driver");
91MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 92MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -504,6 +505,20 @@ static struct notifier_block __refdata mc_cpu_notifier = {
504 .notifier_call = mc_cpu_callback, 505 .notifier_call = mc_cpu_callback,
505}; 506};
506 507
508#ifdef MODULE
509/* Autoload on Intel and AMD systems */
510static const struct x86_cpu_id microcode_id[] = {
511#ifdef CONFIG_MICROCODE_INTEL
512 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
513#endif
514#ifdef CONFIG_MICROCODE_AMD
515 { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
516#endif
517 {}
518};
519MODULE_DEVICE_TABLE(x86cpu, microcode_id);
520#endif
521
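
MODULE_DEVICE_TABLE(x86cpu, ...) exports the table above so udev can autoload the module when the running CPU matches an entry; matching is a wildcard walk over vendor/family/model. A simplified userspace model of that walk follows; the ANY constant, struct layout and vendor numbers are illustrative, not the kernel's exact definitions.

#include <stdio.h>

#define ANY 0xffff   /* wildcard, stands in for X86_FAMILY_ANY and friends */

struct cpu_id {
	unsigned short vendor, family, model;
};

/* Return the first table entry matching the given CPU, or NULL. */
static const struct cpu_id *cpu_match(const struct cpu_id *table,
				      unsigned short vendor,
				      unsigned short family,
				      unsigned short model)
{
	for (const struct cpu_id *m = table; m->vendor | m->family | m->model; m++) {
		if (m->vendor != ANY && m->vendor != vendor)
			continue;
		if (m->family != ANY && m->family != family)
			continue;
		if (m->model != ANY && m->model != model)
			continue;
		return m;
	}
	return NULL;
}

int main(void)
{
	/* "any Intel, any AMD": the shape of the microcode_id table above */
	static const struct cpu_id table[] = {
		{ 0 /* Intel */, ANY, ANY },
		{ 2 /* AMD   */, ANY, ANY },
		{ 0, 0, 0 }                  /* terminator */
	};

	printf("matches: %s\n", cpu_match(table, 0, 6, 42) ? "yes" : "no");
	return 0;
}
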
507static int __init microcode_init(void) 522static int __init microcode_init(void)
508{ 523{
509 struct cpuinfo_x86 *c = &cpu_data(0); 524 struct cpuinfo_x86 *c = &cpu_data(0);
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 0d01a8ea4e11..2c39dcd510fa 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -12,6 +12,7 @@
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpumask.h> 13#include <linux/cpumask.h>
14#include <linux/delay.h> 14#include <linux/delay.h>
15#include <linux/init.h>
15 16
16#include <asm/apic.h> 17#include <asm/apic.h>
17#include <asm/nmi.h> 18#include <asm/nmi.h>
@@ -20,35 +21,35 @@
20#define FAILURE 1 21#define FAILURE 1
21#define TIMEOUT 2 22#define TIMEOUT 2
22 23
23static int nmi_fail; 24static int __initdata nmi_fail;
24 25
25/* check to see if NMI IPIs work on this machine */ 26/* check to see if NMI IPIs work on this machine */
26static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; 27static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata;
27 28
28static int testcase_total; 29static int __initdata testcase_total;
29static int testcase_successes; 30static int __initdata testcase_successes;
30static int expected_testcase_failures; 31static int __initdata expected_testcase_failures;
31static int unexpected_testcase_failures; 32static int __initdata unexpected_testcase_failures;
32static int unexpected_testcase_unknowns; 33static int __initdata unexpected_testcase_unknowns;
33 34
34static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) 35static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
35{ 36{
36 unexpected_testcase_unknowns++; 37 unexpected_testcase_unknowns++;
37 return NMI_HANDLED; 38 return NMI_HANDLED;
38} 39}
39 40
40static void init_nmi_testsuite(void) 41static void __init init_nmi_testsuite(void)
41{ 42{
42 /* trap all the unknown NMIs we may generate */ 43 /* trap all the unknown NMIs we may generate */
43 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); 44 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
44} 45}
45 46
46static void cleanup_nmi_testsuite(void) 47static void __init cleanup_nmi_testsuite(void)
47{ 48{
48 unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); 49 unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
49} 50}
50 51
51static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) 52static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
52{ 53{
53 int cpu = raw_smp_processor_id(); 54 int cpu = raw_smp_processor_id();
54 55
@@ -58,7 +59,7 @@ static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
58 return NMI_DONE; 59 return NMI_DONE;
59} 60}
60 61
61static void test_nmi_ipi(struct cpumask *mask) 62static void __init test_nmi_ipi(struct cpumask *mask)
62{ 63{
63 unsigned long timeout; 64 unsigned long timeout;
64 65
@@ -86,7 +87,7 @@ static void test_nmi_ipi(struct cpumask *mask)
86 return; 87 return;
87} 88}
88 89
89static void remote_ipi(void) 90static void __init remote_ipi(void)
90{ 91{
91 cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); 92 cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
92 cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); 93 cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
@@ -94,19 +95,19 @@ static void remote_ipi(void)
94 test_nmi_ipi(to_cpumask(nmi_ipi_mask)); 95 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
95} 96}
96 97
97static void local_ipi(void) 98static void __init local_ipi(void)
98{ 99{
99 cpumask_clear(to_cpumask(nmi_ipi_mask)); 100 cpumask_clear(to_cpumask(nmi_ipi_mask));
100 cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); 101 cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
101 test_nmi_ipi(to_cpumask(nmi_ipi_mask)); 102 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
102} 103}
103 104
104static void reset_nmi(void) 105static void __init reset_nmi(void)
105{ 106{
106 nmi_fail = 0; 107 nmi_fail = 0;
107} 108}
108 109
109static void dotest(void (*testcase_fn)(void), int expected) 110static void __init dotest(void (*testcase_fn)(void), int expected)
110{ 111{
111 testcase_fn(); 112 testcase_fn();
112 /* 113 /*
@@ -131,12 +132,12 @@ static void dotest(void (*testcase_fn)(void), int expected)
131 reset_nmi(); 132 reset_nmi();
132} 133}
133 134
134static inline void print_testname(const char *testname) 135static inline void __init print_testname(const char *testname)
135{ 136{
136 printk("%12s:", testname); 137 printk("%12s:", testname);
137} 138}
138 139
139void nmi_selftest(void) 140void __init nmi_selftest(void)
140{ 141{
141 init_nmi_testsuite(); 142 init_nmi_testsuite();
142 143
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index d90272e6bc40..9c57c02e54f6 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -26,6 +26,7 @@
26 26
27#include <asm/bug.h> 27#include <asm/bug.h>
28#include <asm/paravirt.h> 28#include <asm/paravirt.h>
29#include <asm/debugreg.h>
29#include <asm/desc.h> 30#include <asm/desc.h>
30#include <asm/setup.h> 31#include <asm/setup.h>
31#include <asm/pgtable.h> 32#include <asm/pgtable.h>
@@ -202,8 +203,8 @@ static void native_flush_tlb_single(unsigned long addr)
202 __native_flush_tlb_single(addr); 203 __native_flush_tlb_single(addr);
203} 204}
204 205
205struct jump_label_key paravirt_steal_enabled; 206struct static_key paravirt_steal_enabled;
206struct jump_label_key paravirt_steal_rq_enabled; 207struct static_key paravirt_steal_rq_enabled;
207 208
208static u64 native_steal_clock(int cpu) 209static u64 native_steal_clock(int cpu)
209{ 210{
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1c4d769e21ea..28e5e06fcba4 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -262,10 +262,11 @@ rootfs_initcall(pci_iommu_init);
262 262
263static __devinit void via_no_dac(struct pci_dev *dev) 263static __devinit void via_no_dac(struct pci_dev *dev)
264{ 264{
265 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { 265 if (forbid_dac == 0) {
266 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); 266 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
267 forbid_dac = 1; 267 forbid_dac = 1;
268 } 268 }
269} 269}
270DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); 270DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
271 PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
271#endif 272#endif
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 34e06e84ce31..0bc72e2069e3 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -12,6 +12,7 @@
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/export.h> 13#include <linux/export.h>
14 14
15#include <asm/probe_roms.h>
15#include <asm/pci-direct.h> 16#include <asm/pci-direct.h>
16#include <asm/e820.h> 17#include <asm/e820.h>
17#include <asm/mmzone.h> 18#include <asm/mmzone.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 15763af7bfe3..14baf78d5a1f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -21,6 +21,7 @@
21#include <asm/idle.h> 21#include <asm/idle.h>
22#include <asm/uaccess.h> 22#include <asm/uaccess.h>
23#include <asm/i387.h> 23#include <asm/i387.h>
24#include <asm/fpu-internal.h>
24#include <asm/debugreg.h> 25#include <asm/debugreg.h>
25 26
26struct kmem_cache *task_xstate_cachep; 27struct kmem_cache *task_xstate_cachep;
@@ -377,8 +378,8 @@ static inline int hlt_use_halt(void)
377void default_idle(void) 378void default_idle(void)
378{ 379{
379 if (hlt_use_halt()) { 380 if (hlt_use_halt()) {
380 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 381 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
381 trace_cpu_idle(1, smp_processor_id()); 382 trace_cpu_idle_rcuidle(1, smp_processor_id());
382 current_thread_info()->status &= ~TS_POLLING; 383 current_thread_info()->status &= ~TS_POLLING;
383 /* 384 /*
384 * TS_POLLING-cleared state must be visible before we 385 * TS_POLLING-cleared state must be visible before we
@@ -391,8 +392,8 @@ void default_idle(void)
391 else 392 else
392 local_irq_enable(); 393 local_irq_enable();
393 current_thread_info()->status |= TS_POLLING; 394 current_thread_info()->status |= TS_POLLING;
394 trace_power_end(smp_processor_id()); 395 trace_power_end_rcuidle(smp_processor_id());
395 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 396 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
396 } else { 397 } else {
397 local_irq_enable(); 398 local_irq_enable();
398 /* loop is done by the caller */ 399 /* loop is done by the caller */
@@ -450,8 +451,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
450static void mwait_idle(void) 451static void mwait_idle(void)
451{ 452{
452 if (!need_resched()) { 453 if (!need_resched()) {
453 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 454 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
454 trace_cpu_idle(1, smp_processor_id()); 455 trace_cpu_idle_rcuidle(1, smp_processor_id());
455 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) 456 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
456 clflush((void *)&current_thread_info()->flags); 457 clflush((void *)&current_thread_info()->flags);
457 458
@@ -461,8 +462,8 @@ static void mwait_idle(void)
461 __sti_mwait(0, 0); 462 __sti_mwait(0, 0);
462 else 463 else
463 local_irq_enable(); 464 local_irq_enable();
464 trace_power_end(smp_processor_id()); 465 trace_power_end_rcuidle(smp_processor_id());
465 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 466 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
466 } else 467 } else
467 local_irq_enable(); 468 local_irq_enable();
468} 469}
@@ -474,13 +475,13 @@ static void mwait_idle(void)
474 */ 475 */
475static void poll_idle(void) 476static void poll_idle(void)
476{ 477{
477 trace_power_start(POWER_CSTATE, 0, smp_processor_id()); 478 trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id());
478 trace_cpu_idle(0, smp_processor_id()); 479 trace_cpu_idle_rcuidle(0, smp_processor_id());
479 local_irq_enable(); 480 local_irq_enable();
480 while (!need_resched()) 481 while (!need_resched())
481 cpu_relax(); 482 cpu_relax();
482 trace_power_end(smp_processor_id()); 483 trace_power_end_rcuidle(smp_processor_id());
483 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); 484 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
484} 485}
485 486
486/* 487/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 485204f58cda..9d7d4842bfaf 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -45,6 +45,7 @@
45#include <asm/ldt.h> 45#include <asm/ldt.h>
46#include <asm/processor.h> 46#include <asm/processor.h>
47#include <asm/i387.h> 47#include <asm/i387.h>
48#include <asm/fpu-internal.h>
48#include <asm/desc.h> 49#include <asm/desc.h>
49#ifdef CONFIG_MATH_EMULATION 50#ifdef CONFIG_MATH_EMULATION
50#include <asm/math_emu.h> 51#include <asm/math_emu.h>
@@ -119,9 +120,7 @@ void cpu_idle(void)
119 } 120 }
120 rcu_idle_exit(); 121 rcu_idle_exit();
121 tick_nohz_idle_exit(); 122 tick_nohz_idle_exit();
122 preempt_enable_no_resched(); 123 schedule_preempt_disabled();
123 schedule();
124 preempt_disable();
125 } 124 }
126} 125}
127 126
@@ -214,6 +213,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
214 213
215 task_user_gs(p) = get_user_gs(regs); 214 task_user_gs(p) = get_user_gs(regs);
216 215
216 p->fpu_counter = 0;
217 p->thread.io_bitmap_ptr = NULL; 217 p->thread.io_bitmap_ptr = NULL;
218 tsk = current; 218 tsk = current;
219 err = -ENOMEM; 219 err = -ENOMEM;
@@ -299,22 +299,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
299 *next = &next_p->thread; 299 *next = &next_p->thread;
300 int cpu = smp_processor_id(); 300 int cpu = smp_processor_id();
301 struct tss_struct *tss = &per_cpu(init_tss, cpu); 301 struct tss_struct *tss = &per_cpu(init_tss, cpu);
302 bool preload_fpu; 302 fpu_switch_t fpu;
303 303
304 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 304 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
305 305
306 /* 306 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
307 * If the task has used fpu the last 5 timeslices, just do a full
308 * restore of the math state immediately to avoid the trap; the
309 * chances of needing FPU soon are obviously high now
310 */
311 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
312
313 __unlazy_fpu(prev_p);
314
315 /* we're going to use this soon, after a few expensive things */
316 if (preload_fpu)
317 prefetch(next->fpu.state);
318 307
319 /* 308 /*
320 * Reload esp0. 309 * Reload esp0.
@@ -354,11 +343,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
354 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) 343 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
355 __switch_to_xtra(prev_p, next_p, tss); 344 __switch_to_xtra(prev_p, next_p, tss);
356 345
357 /* If we're going to preload the fpu context, make sure clts
358 is run while we're batching the cpu state updates. */
359 if (preload_fpu)
360 clts();
361
362 /* 346 /*
363 * Leave lazy mode, flushing any hypercalls made here. 347 * Leave lazy mode, flushing any hypercalls made here.
364 * This must be done before restoring TLS segments so 348 * This must be done before restoring TLS segments so
@@ -368,15 +352,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
368 */ 352 */
369 arch_end_context_switch(next_p); 353 arch_end_context_switch(next_p);
370 354
371 if (preload_fpu)
372 __math_state_restore();
373
374 /* 355 /*
375 * Restore %gs if needed (which is common) 356 * Restore %gs if needed (which is common)
376 */ 357 */
377 if (prev->gs | next->gs) 358 if (prev->gs | next->gs)
378 lazy_load_gs(next->gs); 359 lazy_load_gs(next->gs);
379 360
361 switch_fpu_finish(next_p, fpu);
362
380 percpu_write(current_task, next_p); 363 percpu_write(current_task, next_p);
381 364
382 return prev_p; 365 return prev_p;
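
Both the removed open-coded logic and its switch_fpu_prepare()/switch_fpu_finish() replacement implement the same heuristic: if the incoming task used the FPU in each of its last five timeslices, restore its FPU state eagerly during the context switch; otherwise leave CR0.TS set and rely on the lazy #NM trap. A sketch of just that decision, with made-up task bookkeeping standing in for the kernel's used_math()/fpu_counter fields:

#include <stdbool.h>
#include <stdio.h>

struct task {
	const char *name;
	bool used_math;           /* task has FPU state at all            */
	unsigned int fpu_counter; /* consecutive timeslices that used FPU */
};

/* Mirror of the "preload if used in the last 5 timeslices" heuristic. */
static bool should_preload_fpu(const struct task *next)
{
	return next->used_math && next->fpu_counter > 5;
}

static void switch_to(struct task *prev, struct task *next)
{
	(void)prev;   /* prev's FPU state handling is omitted in this sketch */

	if (should_preload_fpu(next))
		printf("-> %s: eager FPU restore during switch\n", next->name);
	else
		printf("-> %s: lazy, restore on first FPU use (#NM)\n", next->name);
}

int main(void)
{
	struct task idle   = { "idle",   false, 0 };
	struct task render = { "render", true,  9 };

	switch_to(&idle, &render);  /* eager */
	switch_to(&render, &idle);  /* lazy  */
	return 0;
}
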
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9b9fe4a85c87..292da13fc5aa 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -43,6 +43,7 @@
43#include <asm/system.h> 43#include <asm/system.h>
44#include <asm/processor.h> 44#include <asm/processor.h>
45#include <asm/i387.h> 45#include <asm/i387.h>
46#include <asm/fpu-internal.h>
46#include <asm/mmu_context.h> 47#include <asm/mmu_context.h>
47#include <asm/prctl.h> 48#include <asm/prctl.h>
48#include <asm/desc.h> 49#include <asm/desc.h>
@@ -156,9 +157,7 @@ void cpu_idle(void)
156 } 157 }
157 158
158 tick_nohz_idle_exit(); 159 tick_nohz_idle_exit();
159 preempt_enable_no_resched(); 160 schedule_preempt_disabled();
160 schedule();
161 preempt_disable();
162 } 161 }
163} 162}
164 163
@@ -286,6 +285,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
286 285
287 set_tsk_thread_flag(p, TIF_FORK); 286 set_tsk_thread_flag(p, TIF_FORK);
288 287
288 p->fpu_counter = 0;
289 p->thread.io_bitmap_ptr = NULL; 289 p->thread.io_bitmap_ptr = NULL;
290 290
291 savesegment(gs, p->thread.gsindex); 291 savesegment(gs, p->thread.gsindex);
@@ -341,6 +341,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
341 loadsegment(es, _ds); 341 loadsegment(es, _ds);
342 loadsegment(ds, _ds); 342 loadsegment(ds, _ds);
343 load_gs_index(0); 343 load_gs_index(0);
344 current->thread.usersp = new_sp;
344 regs->ip = new_ip; 345 regs->ip = new_ip;
345 regs->sp = new_sp; 346 regs->sp = new_sp;
346 percpu_write(old_rsp, new_sp); 347 percpu_write(old_rsp, new_sp);
@@ -386,18 +387,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
386 int cpu = smp_processor_id(); 387 int cpu = smp_processor_id();
387 struct tss_struct *tss = &per_cpu(init_tss, cpu); 388 struct tss_struct *tss = &per_cpu(init_tss, cpu);
388 unsigned fsindex, gsindex; 389 unsigned fsindex, gsindex;
389 bool preload_fpu; 390 fpu_switch_t fpu;
390 391
391 /* 392 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
392 * If the task has used fpu the last 5 timeslices, just do a full
393 * restore of the math state immediately to avoid the trap; the
394 * chances of needing FPU soon are obviously high now
395 */
396 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
397
398 /* we're going to use this soon, after a few expensive things */
399 if (preload_fpu)
400 prefetch(next->fpu.state);
401 393
402 /* 394 /*
403 * Reload esp0, LDT and the page table pointer: 395 * Reload esp0, LDT and the page table pointer:
@@ -427,13 +419,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
427 419
428 load_TLS(next, cpu); 420 load_TLS(next, cpu);
429 421
430 /* Must be after DS reload */
431 __unlazy_fpu(prev_p);
432
433 /* Make sure cpu is ready for new context */
434 if (preload_fpu)
435 clts();
436
437 /* 422 /*
438 * Leave lazy mode, flushing any hypercalls made here. 423 * Leave lazy mode, flushing any hypercalls made here.
439 * This must be done before restoring TLS segments so 424 * This must be done before restoring TLS segments so
@@ -474,6 +459,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
474 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 459 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
475 prev->gsindex = gsindex; 460 prev->gsindex = gsindex;
476 461
462 switch_fpu_finish(next_p, fpu);
463
477 /* 464 /*
478 * Switch the PDA and FPU contexts. 465 * Switch the PDA and FPU contexts.
479 */ 466 */
@@ -492,13 +479,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
492 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) 479 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
493 __switch_to_xtra(prev_p, next_p, tss); 480 __switch_to_xtra(prev_p, next_p, tss);
494 481
495 /*
496 * Preload the FPU context, now that we've determined that the
497 * task is likely to be using it.
498 */
499 if (preload_fpu)
500 __math_state_restore();
501
502 return prev_p; 482 return prev_p;
503} 483}
504 484
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 50267386b766..78f05e438be5 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -27,6 +27,7 @@
27#include <asm/system.h> 27#include <asm/system.h>
28#include <asm/processor.h> 28#include <asm/processor.h>
29#include <asm/i387.h> 29#include <asm/i387.h>
30#include <asm/fpu-internal.h>
30#include <asm/debugreg.h> 31#include <asm/debugreg.h>
31#include <asm/ldt.h> 32#include <asm/ldt.h>
32#include <asm/desc.h> 33#include <asm/desc.h>
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 37a458b521a6..d840e69a853c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -39,6 +39,14 @@ static int reboot_mode;
39enum reboot_type reboot_type = BOOT_ACPI; 39enum reboot_type reboot_type = BOOT_ACPI;
40int reboot_force; 40int reboot_force;
41 41
42/* This variable is used privately to keep track of whether or not
43 * reboot_type is still set to its default value (i.e., reboot= hasn't
44 * been set on the command line). This is needed so that we can
45 * suppress DMI scanning for reboot quirks. Without it, it's
46 * impossible to override a faulty reboot quirk without recompiling.
47 */
48static int reboot_default = 1;
49
42#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 50#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
43static int reboot_cpu = -1; 51static int reboot_cpu = -1;
44#endif 52#endif
@@ -67,6 +75,12 @@ bool port_cf9_safe = false;
67static int __init reboot_setup(char *str) 75static int __init reboot_setup(char *str)
68{ 76{
69 for (;;) { 77 for (;;) {
78 /* Having anything passed on the command line via
79 * reboot= will cause us to disable DMI checking
80 * below.
81 */
82 reboot_default = 0;
83
70 switch (*str) { 84 switch (*str) {
71 case 'w': 85 case 'w':
72 reboot_mode = 0x1234; 86 reboot_mode = 0x1234;
@@ -295,14 +309,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
295 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 309 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
296 }, 310 },
297 }, 311 },
298 { /* Handle problems with rebooting on VersaLogic Menlow boards */
299 .callback = set_bios_reboot,
300 .ident = "VersaLogic Menlow based board",
301 .matches = {
302 DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"),
303 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
304 },
305 },
306 { /* Handle reboot issue on Acer Aspire one */ 312 { /* Handle reboot issue on Acer Aspire one */
307 .callback = set_kbd_reboot, 313 .callback = set_kbd_reboot,
308 .ident = "Acer Aspire One A110", 314 .ident = "Acer Aspire One A110",
@@ -316,7 +322,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
316 322
317static int __init reboot_init(void) 323static int __init reboot_init(void)
318{ 324{
319 dmi_check_system(reboot_dmi_table); 325 /* Only do the DMI check if reboot_type hasn't been overridden
326 * on the command line
327 */
328 if (reboot_default) {
329 dmi_check_system(reboot_dmi_table);
330 }
320 return 0; 331 return 0;
321} 332}
322core_initcall(reboot_init); 333core_initcall(reboot_init);
@@ -465,7 +476,12 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
465 476
466static int __init pci_reboot_init(void) 477static int __init pci_reboot_init(void)
467{ 478{
468 dmi_check_system(pci_reboot_dmi_table); 479 /* Only do the DMI check if reboot_type hasn't been overridden
480 * on the command line
481 */
482 if (reboot_default) {
483 dmi_check_system(pci_reboot_dmi_table);
484 }
469 return 0; 485 return 0;
470} 486}
471core_initcall(pci_reboot_init); 487core_initcall(pci_reboot_init);
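
The reboot.c hunks all follow one pattern: a one-shot reboot_default flag records whether reboot= ever appeared on the command line, and the DMI quirk tables are only consulted when the user expressed no preference. A compact sketch of that gating; parse_reboot_option() and apply_dmi_quirks() are hypothetical stand-ins for the kernel's parser and dmi_check_system() call.

#include <stdio.h>

/* Stays 1 until a reboot= option is seen, mirroring reboot_default above. */
static int reboot_default = 1;
static char reboot_type = 'a';          /* 'a' = ACPI, the built-in default */

static void parse_reboot_option(const char *arg)
{
	reboot_default = 0;             /* any user choice disables quirks */
	reboot_type = arg[0];           /* grossly simplified parser       */
}

static void apply_dmi_quirks(void)
{
	printf("applying board-specific reboot quirk\n");
	reboot_type = 'b';              /* e.g. force a BIOS reboot        */
}

static void reboot_init(const char *cmdline_value)
{
	if (cmdline_value)
		parse_reboot_option(cmdline_value);

	/* Only consult the DMI quirk tables if the user didn't override. */
	if (reboot_default)
		apply_dmi_quirks();

	printf("reboot_type = %c\n", reboot_type);
}

int main(void)
{
	reboot_init(NULL);      /* no reboot= given: quirks may apply   */
	reboot_init("acpi");    /* user set reboot=acpi: quirks skipped */
	return 0;
}
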
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d7d5099fe874..88638883176a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -749,10 +749,16 @@ void __init setup_arch(char **cmdline_p)
749#endif 749#endif
750#ifdef CONFIG_EFI 750#ifdef CONFIG_EFI
751 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 751 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
752 EFI_LOADER_SIGNATURE, 4)) { 752 "EL32", 4)) {
753 efi_enabled = 1; 753 efi_enabled = 1;
754 efi_memblock_x86_reserve_range(); 754 efi_64bit = false;
755 } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
756 "EL64", 4)) {
757 efi_enabled = 1;
758 efi_64bit = true;
755 } 759 }
760 if (efi_enabled && efi_memblock_x86_reserve_range())
761 efi_enabled = 0;
756#endif 762#endif
757 763
758 x86_init.oem.arch_setup(); 764 x86_init.oem.arch_setup();
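
The setup.c change tells 32-bit and 64-bit EFI firmware apart by the 4-byte loader signature ("EL32" or "EL64") that the boot loader stores in boot_params. A small sketch of that classification; the efi_info struct here is trimmed down to just the signature field.

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

struct efi_info {               /* reduced stand-in for boot_params.efi_info */
	char efi_loader_signature[4];
};

static bool efi_enabled, efi_64bit;

static void detect_efi(const struct efi_info *info)
{
	if (!strncmp(info->efi_loader_signature, "EL32", 4)) {
		efi_enabled = true;
		efi_64bit = false;
	} else if (!strncmp(info->efi_loader_signature, "EL64", 4)) {
		efi_enabled = true;
		efi_64bit = true;
	}
	/* anything else: not booted via an EFI stub/loader we recognize */
}

int main(void)
{
	struct efi_info info = { { 'E', 'L', '6', '4' } };

	detect_efi(&info);
	printf("efi_enabled=%d efi_64bit=%d\n", efi_enabled, efi_64bit);
	return 0;
}
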
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 46a01bdc27e2..25edcfc9ba5b 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -24,6 +24,7 @@
24#include <asm/processor.h> 24#include <asm/processor.h>
25#include <asm/ucontext.h> 25#include <asm/ucontext.h>
26#include <asm/i387.h> 26#include <asm/i387.h>
27#include <asm/fpu-internal.h>
27#include <asm/vdso.h> 28#include <asm/vdso.h>
28#include <asm/mce.h> 29#include <asm/mce.h>
29 30
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c00d11..5104a2b685cf 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused)
255 * most necessary things. 255 * most necessary things.
256 */ 256 */
257 cpu_init(); 257 cpu_init();
258 x86_cpuinit.early_percpu_clock_init();
258 preempt_disable(); 259 preempt_disable();
259 smp_callin(); 260 smp_callin();
260 261
@@ -291,19 +292,6 @@ notrace static void __cpuinit start_secondary(void *unused)
291 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 292 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
292 x86_platform.nmi_init(); 293 x86_platform.nmi_init();
293 294
294 /*
295 * Wait until the cpu which brought this one up marked it
296 * online before enabling interrupts. If we don't do that then
297 * we can end up waking up the softirq thread before this cpu
298 * reached the active state, which makes the scheduler unhappy
299 * and schedule the softirq thread on the wrong cpu. This is
300 * only observable with forced threaded interrupts, but in
301 * theory it could also happen w/o them. It's just way harder
302 * to achieve.
303 */
304 while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
305 cpu_relax();
306
307 /* enable local interrupts */ 295 /* enable local interrupts */
308 local_irq_enable(); 296 local_irq_enable();
309 297
@@ -740,8 +728,6 @@ do_rest:
740 * the targeted processor. 728 * the targeted processor.
741 */ 729 */
742 730
743 printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
744
745 atomic_set(&init_deasserted, 0); 731 atomic_set(&init_deasserted, 0);
746 732
747 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 733 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -791,9 +777,10 @@ do_rest:
791 schedule(); 777 schedule();
792 } 778 }
793 779
794 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 780 if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
781 print_cpu_msr(&cpu_data(cpu));
795 pr_debug("CPU%d: has booted.\n", cpu); 782 pr_debug("CPU%d: has booted.\n", cpu);
796 else { 783 } else {
797 boot_error = 1; 784 boot_error = 1;
798 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) 785 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
799 == 0xA5A5A5A5) 786 == 0xA5A5A5A5)
@@ -847,7 +834,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
847 834
848 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 835 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
849 !physid_isset(apicid, phys_cpu_present_map) || 836 !physid_isset(apicid, phys_cpu_present_map) ||
850 (!x2apic_mode && apicid >= 255)) { 837 !apic->apic_id_valid(apicid)) {
851 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); 838 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
852 return -EINVAL; 839 return -EINVAL;
853 } 840 }
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 051489082d59..ef59642ff1bf 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
195{ 195{
196 struct vm_area_struct *vma; 196 struct vm_area_struct *vma;
197 struct mm_struct *mm = current->mm; 197 struct mm_struct *mm = current->mm;
198 unsigned long addr = addr0; 198 unsigned long addr = addr0, start_addr;
199 199
200 /* requested length too big for entire address space */ 200 /* requested length too big for entire address space */
201 if (len > TASK_SIZE) 201 if (len > TASK_SIZE)
@@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
223 mm->free_area_cache = mm->mmap_base; 223 mm->free_area_cache = mm->mmap_base;
224 } 224 }
225 225
226try_again:
226 /* either no address requested or can't fit in requested address hole */ 227 /* either no address requested or can't fit in requested address hole */
227 addr = mm->free_area_cache; 228 start_addr = addr = mm->free_area_cache;
228
229 /* make sure it can fit in the remaining address space */
230 if (addr > len) {
231 unsigned long tmp_addr = align_addr(addr - len, filp,
232 ALIGN_TOPDOWN);
233
234 vma = find_vma(mm, tmp_addr);
235 if (!vma || tmp_addr + len <= vma->vm_start)
236 /* remember the address as a hint for next time */
237 return mm->free_area_cache = tmp_addr;
238 }
239
240 if (mm->mmap_base < len)
241 goto bottomup;
242 229
243 addr = mm->mmap_base-len; 230 if (addr < len)
231 goto fail;
244 232
233 addr -= len;
245 do { 234 do {
246 addr = align_addr(addr, filp, ALIGN_TOPDOWN); 235 addr = align_addr(addr, filp, ALIGN_TOPDOWN);
247 236
@@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
263 addr = vma->vm_start-len; 252 addr = vma->vm_start-len;
264 } while (len < vma->vm_start); 253 } while (len < vma->vm_start);
265 254
255fail:
256 /*
257 * if hint left us with no space for the requested
258 * mapping then try again:
259 */
260 if (start_addr != mm->mmap_base) {
261 mm->free_area_cache = mm->mmap_base;
262 mm->cached_hole_size = 0;
263 goto try_again;
264 }
265
266bottomup: 266bottomup:
267 /* 267 /*
268 * A failed mmap() very likely causes application failure, 268 * A failed mmap() very likely causes application failure,
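
The new try_again/fail labels implement a common caching trick: start the top-down search at the cached free_area_cache hint and, if nothing fits below the hint, reset to mmap_base and search the full window once more before giving up to the bottom-up path. A self-contained sketch of that search over a toy set of reserved regions; the region list, step granularity and MMAP_BASE value are all illustrative.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define MMAP_BASE 0x8000u        /* top of the search window */

/* Already-mapped regions, the toy "VMA list". */
struct region { uint32_t start, end; };

static const struct region used[] = {
	{ 0x400,  0x2000 },
	{ 0x5000, 0x7000 },
};

/* Does [addr, addr+len) overlap any reserved region? */
static int overlaps(uint32_t addr, uint32_t len)
{
	for (unsigned i = 0; i < sizeof(used) / sizeof(used[0]); i++)
		if (addr < used[i].end && addr + len > used[i].start)
			return 1;
	return 0;
}

static uint32_t free_area_cache = 0x1800;  /* stale hint from an earlier search */

/* Top-down search below 'limit'; returns 0 on failure. */
static uint32_t search_below(uint32_t limit, uint32_t len)
{
	for (uint32_t addr = limit - len; addr >= len; addr -= 0x100)
		if (!overlaps(addr, len))
			return addr;
	return 0;
}

static uint32_t get_unmapped_area_topdown(uint32_t len)
{
	uint32_t start = free_area_cache;
	uint32_t addr = search_below(start, len);

	/* Hint exhausted: reset to the real top and try the full window once. */
	if (!addr && start != MMAP_BASE)
		addr = search_below(MMAP_BASE, len);

	if (addr)
		free_area_cache = addr;        /* remember as the next hint */
	return addr;
}

int main(void)
{
	printf("got 0x%" PRIx32 "\n", get_unmapped_area_topdown(0x800));
	return 0;
}
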
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index dd5fbf4101fc..c6eba2b42673 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -57,9 +57,6 @@ EXPORT_SYMBOL(profile_pc);
57 */ 57 */
58static irqreturn_t timer_interrupt(int irq, void *dev_id) 58static irqreturn_t timer_interrupt(int irq, void *dev_id)
59{ 59{
60 /* Keep nmi watchdog up to date */
61 inc_irq_stat(irq0_irqs);
62
63 global_clock_event->event_handler(global_clock_event); 60 global_clock_event->event_handler(global_clock_event);
64 61
65 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ 62 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 482ec3af2067..ec61d4c1b93b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -54,6 +54,7 @@
54#include <asm/traps.h> 54#include <asm/traps.h>
55#include <asm/desc.h> 55#include <asm/desc.h>
56#include <asm/i387.h> 56#include <asm/i387.h>
57#include <asm/fpu-internal.h>
57#include <asm/mce.h> 58#include <asm/mce.h>
58 59
59#include <asm/mach_traps.h> 60#include <asm/mach_traps.h>
@@ -571,41 +572,18 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
571} 572}
572 573
573/* 574/*
574 * __math_state_restore assumes that cr0.TS is already clear and the
575 * fpu state is all ready for use. Used during context switch.
576 */
577void __math_state_restore(void)
578{
579 struct thread_info *thread = current_thread_info();
580 struct task_struct *tsk = thread->task;
581
582 /*
583 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
584 */
585 if (unlikely(restore_fpu_checking(tsk))) {
586 stts();
587 force_sig(SIGSEGV, tsk);
588 return;
589 }
590
591 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
592 tsk->fpu_counter++;
593}
594
595/*
596 * 'math_state_restore()' saves the current math information in the 575 * 'math_state_restore()' saves the current math information in the
597 * old math state array, and gets the new ones from the current task 576 * old math state array, and gets the new ones from the current task
598 * 577 *
599 * Careful.. There are problems with IBM-designed IRQ13 behaviour. 578 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
600 * Don't touch unless you *really* know how it works. 579 * Don't touch unless you *really* know how it works.
601 * 580 *
602 * Must be called with kernel preemption disabled (in this case, 581 * Must be called with kernel preemption disabled (eg with local
603 * local interrupts are disabled at the call-site in entry.S). 582 * local interrupts as in the case of do_device_not_available).
604 */ 583 */
605asmlinkage void math_state_restore(void) 584void math_state_restore(void)
606{ 585{
607 struct thread_info *thread = current_thread_info(); 586 struct task_struct *tsk = current;
608 struct task_struct *tsk = thread->task;
609 587
610 if (!tsk_used_math(tsk)) { 588 if (!tsk_used_math(tsk)) {
611 local_irq_enable(); 589 local_irq_enable();
@@ -622,9 +600,17 @@ asmlinkage void math_state_restore(void)
622 local_irq_disable(); 600 local_irq_disable();
623 } 601 }
624 602
625 clts(); /* Allow maths ops (or we recurse) */ 603 __thread_fpu_begin(tsk);
604 /*
605 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
606 */
607 if (unlikely(restore_fpu_checking(tsk))) {
608 __thread_fpu_end(tsk);
609 force_sig(SIGSEGV, tsk);
610 return;
611 }
626 612
627 __math_state_restore(); 613 tsk->fpu_counter++;
628} 614}
629EXPORT_SYMBOL_GPL(math_state_restore); 615EXPORT_SYMBOL_GPL(math_state_restore);
630 616
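
The reworked math_state_restore() follows a begin/try/roll-back shape: claim FPU ownership for the task, attempt to restore its saved register image, and if the restore faults, drop ownership again and send SIGSEGV rather than leave half-loaded state behind. A schematic sketch of that pattern; thread_fpu_begin/end and restore_checking are stand-ins for __thread_fpu_begin(), __thread_fpu_end() and restore_fpu_checking().

#include <stdbool.h>
#include <stdio.h>

struct task {
	const char *name;
	bool owns_fpu;           /* does this task own the FPU right now?   */
	bool state_corrupt;      /* pretend flag: saved image won't restore */
	unsigned int fpu_counter;
};

static void thread_fpu_begin(struct task *t) { t->owns_fpu = true; }
static void thread_fpu_end(struct task *t)   { t->owns_fpu = false; }

/* Returns nonzero on failure, like restore_fpu_checking(). */
static int restore_checking(struct task *t)
{
	return t->state_corrupt ? -1 : 0;
}

static void math_state_restore(struct task *t)
{
	thread_fpu_begin(t);
	if (restore_checking(t)) {
		/* Paranoid path: undo ownership and signal the task. */
		thread_fpu_end(t);
		printf("%s: restore failed, sending SIGSEGV\n", t->name);
		return;
	}
	t->fpu_counter++;
	printf("%s: FPU state restored (fpu_counter=%u)\n", t->name, t->fpu_counter);
}

int main(void)
{
	struct task good = { "good", false, false, 0 };
	struct task bad  = { "bad",  false, true,  0 };

	math_state_restore(&good);
	math_state_restore(&bad);
	return 0;
}
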
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a62c201c97ec..899a03f2d181 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
620 620
621 if (cpu_khz) { 621 if (cpu_khz) {
622 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; 622 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
623 *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); 623 *offset = ns_now - mult_frac(tsc_now, *scale,
624 (1UL << CYC2NS_SCALE_FACTOR));
624 } 625 }
625 626
626 sched_clock_idle_wakeup_event(0); 627 sched_clock_idle_wakeup_event(0);
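
The switch to mult_frac() avoids an overflow: tsc_now * *scale can exceed 64 bits once the machine has been up long enough, so the multiply is split around the divisor before shifting. mult_frac(x, n, d) computes x * n / d as (x / d) * n + ((x % d) * n) / d, which is exact. A quick userspace check of the identity with roughly realistic numbers; the multiplier and cycle count below are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

/* Same algebra as the kernel's mult_frac(): never forms x * n in full. */
static uint64_t mult_frac(uint64_t x, uint64_t n, uint64_t d)
{
	uint64_t q = x / d;
	uint64_t r = x % d;

	return q * n + r * n / d;
}

int main(void)
{
	uint64_t tsc  = 94600000000000000ULL;  /* ~1 year of cycles at 3 GHz    */
	uint64_t mul  = 341;                   /* ~cyc2ns scale for a 3 GHz CPU */
	uint64_t frac = 1 << 10;               /* 1 << CYC2NS_SCALE_FACTOR      */

	/*
	 * The naive tsc * mul is ~3.2e19 and would wrap a 64-bit multiply;
	 * mult_frac stays in range because the result (~3.15e16 ns) fits.
	 */
	printf("ns = %" PRIu64 "\n", mult_frac(tsc, mul, frac));
	return 0;
}
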
@@ -629,7 +630,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
629 630
630static unsigned long long cyc2ns_suspend; 631static unsigned long long cyc2ns_suspend;
631 632
632void save_sched_clock_state(void) 633void tsc_save_sched_clock_state(void)
633{ 634{
634 if (!sched_clock_stable) 635 if (!sched_clock_stable)
635 return; 636 return;
@@ -645,7 +646,7 @@ void save_sched_clock_state(void)
645 * that sched_clock() continues from the point where it was left off during 646 * that sched_clock() continues from the point where it was left off during
646 * suspend. 647 * suspend.
647 */ 648 */
648void restore_sched_clock_state(void) 649void tsc_restore_sched_clock_state(void)
649{ 650{
650 unsigned long long offset; 651 unsigned long long offset;
651 unsigned long flags; 652 unsigned long flags;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9eba29b46cb7..fc25e60a5884 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -42,7 +42,7 @@ static __cpuinitdata int nr_warps;
42/* 42/*
43 * TSC-warp measurement loop running on both CPUs: 43 * TSC-warp measurement loop running on both CPUs:
44 */ 44 */
45static __cpuinit void check_tsc_warp(void) 45static __cpuinit void check_tsc_warp(unsigned int timeout)
46{ 46{
47 cycles_t start, now, prev, end; 47 cycles_t start, now, prev, end;
48 int i; 48 int i;
@@ -51,9 +51,9 @@ static __cpuinit void check_tsc_warp(void)
51 start = get_cycles(); 51 start = get_cycles();
52 rdtsc_barrier(); 52 rdtsc_barrier();
53 /* 53 /*
54 * The measurement runs for 20 msecs: 54 * The measurement runs for 'timeout' msecs:
55 */ 55 */
56 end = start + tsc_khz * 20ULL; 56 end = start + (cycles_t) tsc_khz * timeout;
57 now = start; 57 now = start;
58 58
59 for (i = 0; ; i++) { 59 for (i = 0; ; i++) {
@@ -99,6 +99,25 @@ static __cpuinit void check_tsc_warp(void)
99} 99}
100 100
101/* 101/*
102 * If the target CPU coming online doesn't have any of its core-siblings
103 * online, a timeout of 20msec will be used for the TSC-warp measurement
104 * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
105 * information about this socket already (and this information grows as we
106 * have more and more logical-siblings in that socket).
107 *
108 * Ideally we should be able to skip the TSC sync check on the other
109 * core-siblings, if the first logical CPU in a socket passed the sync test.
110 * But as the TSC is per-logical CPU and can potentially be modified
111 * incorrectly by the BIOS, even a shorter sync test should be able to
112 * catch such errors. It will also catch the case where the cores in the
113 * socket don't get reset at the same time.
114 */
115static inline unsigned int loop_timeout(int cpu)
116{
117 return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20;
118}
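
loop_timeout() shrinks the warp-measurement window from 20ms to 2ms once a core sibling of the incoming CPU has already been checked, and check_tsc_warp() then converts that timeout to cycles as start + tsc_khz * timeout, since tsc_khz is the number of TSC cycles per millisecond. A small sketch of both calculations; the sibling count and tsc_khz values are hypothetical inputs.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

typedef uint64_t cycles_t;

/* 2ms once a sibling on this socket was already checked, else 20ms. */
static unsigned int loop_timeout(unsigned int online_core_siblings)
{
	return (online_core_siblings > 1) ? 2 : 20;
}

int main(void)
{
	unsigned long tsc_khz = 2933000;        /* a 2.933 GHz part            */
	cycles_t start = 1000000000ULL;         /* pretend current TSC reading */

	for (unsigned int siblings = 1; siblings <= 2; siblings++) {
		unsigned int timeout = loop_timeout(siblings);
		cycles_t end = start + (cycles_t)tsc_khz * timeout;

		printf("siblings=%u timeout=%ums end=%" PRIu64 "\n",
		       siblings, timeout, end);
	}
	return 0;
}
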
119
120/*
102 * Source CPU calls into this - it waits for the freshly booted 121 * Source CPU calls into this - it waits for the freshly booted
103 * target CPU to arrive and then starts the measurement: 122 * target CPU to arrive and then starts the measurement:
104 */ 123 */
@@ -135,7 +154,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
135 */ 154 */
136 atomic_inc(&start_count); 155 atomic_inc(&start_count);
137 156
138 check_tsc_warp(); 157 check_tsc_warp(loop_timeout(cpu));
139 158
140 while (atomic_read(&stop_count) != cpus-1) 159 while (atomic_read(&stop_count) != cpus-1)
141 cpu_relax(); 160 cpu_relax();
@@ -183,7 +202,7 @@ void __cpuinit check_tsc_sync_target(void)
183 while (atomic_read(&start_count) != cpus) 202 while (atomic_read(&start_count) != cpus)
184 cpu_relax(); 203 cpu_relax();
185 204
186 check_tsc_warp(); 205 check_tsc_warp(loop_timeout(smp_processor_id()));
187 206
188 /* 207 /*
189 * Ok, we are done: 208 * Ok, we are done:
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index b466cab5ba15..328cb37bb827 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
172 spinlock_t *ptl; 172 spinlock_t *ptl;
173 int i; 173 int i;
174 174
175 down_write(&mm->mmap_sem);
175 pgd = pgd_offset(mm, 0xA0000); 176 pgd = pgd_offset(mm, 0xA0000);
176 if (pgd_none_or_clear_bad(pgd)) 177 if (pgd_none_or_clear_bad(pgd))
177 goto out; 178 goto out;
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
190 } 191 }
191 pte_unmap_unlock(pte, ptl); 192 pte_unmap_unlock(pte, ptl);
192out: 193out:
194 up_write(&mm->mmap_sem);
193 flush_tlb(); 195 flush_tlb();
194} 196}
195 197
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 947a06ccc673..e9f265fd79ae 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = {
91}; 91};
92 92
93struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 93struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
94 .early_percpu_clock_init = x86_init_noop,
94 .setup_percpu_clockev = setup_secondary_APIC_clock, 95 .setup_percpu_clockev = setup_secondary_APIC_clock,
95 .fixup_cpu_id = x86_default_fixup_cpu_id, 96 .fixup_cpu_id = x86_default_fixup_cpu_id,
96}; 97};
@@ -107,7 +108,9 @@ struct x86_platform_ops x86_platform = {
107 .is_untracked_pat_range = is_ISA_range, 108 .is_untracked_pat_range = is_ISA_range,
108 .nmi_init = default_nmi_init, 109 .nmi_init = default_nmi_init,
109 .get_nmi_reason = default_get_nmi_reason, 110 .get_nmi_reason = default_get_nmi_reason,
110 .i8042_detect = default_i8042_detect 111 .i8042_detect = default_i8042_detect,
112 .save_sched_clock_state = tsc_save_sched_clock_state,
113 .restore_sched_clock_state = tsc_restore_sched_clock_state,
111}; 114};
112 115
113EXPORT_SYMBOL_GPL(x86_platform); 116EXPORT_SYMBOL_GPL(x86_platform);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a3911343976b..e62728e30b01 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -6,6 +6,7 @@
6#include <linux/bootmem.h> 6#include <linux/bootmem.h>
7#include <linux/compat.h> 7#include <linux/compat.h>
8#include <asm/i387.h> 8#include <asm/i387.h>
9#include <asm/fpu-internal.h>
9#ifdef CONFIG_IA32_EMULATION 10#ifdef CONFIG_IA32_EMULATION
10#include <asm/sigcontext32.h> 11#include <asm/sigcontext32.h>
11#endif 12#endif
@@ -47,7 +48,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
47 if (!fx) 48 if (!fx)
48 return; 49 return;
49 50
50 BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); 51 BUG_ON(__thread_has_fpu(tsk));
51 52
52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; 53 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
53 54
@@ -168,7 +169,7 @@ int save_i387_xstate(void __user *buf)
168 if (!used_math()) 169 if (!used_math())
169 return 0; 170 return 0;
170 171
171 if (task_thread_info(tsk)->status & TS_USEDFPU) { 172 if (user_has_fpu()) {
172 if (use_xsave()) 173 if (use_xsave())
173 err = xsave_user(buf); 174 err = xsave_user(buf);
174 else 175 else
@@ -176,8 +177,7 @@ int save_i387_xstate(void __user *buf)
176 177
177 if (err) 178 if (err)
178 return err; 179 return err;
179 task_thread_info(tsk)->status &= ~TS_USEDFPU; 180 user_fpu_end();
180 stts();
181 } else { 181 } else {
182 sanitize_i387_state(tsk); 182 sanitize_i387_state(tsk);
183 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, 183 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
@@ -292,10 +292,7 @@ int restore_i387_xstate(void __user *buf)
292 return err; 292 return err;
293 } 293 }
294 294
295 if (!(task_thread_info(current)->status & TS_USEDFPU)) { 295 user_fpu_begin();
296 clts();
297 task_thread_info(current)->status |= TS_USEDFPU;
298 }
299 if (use_xsave()) 296 if (use_xsave())
300 err = restore_user_xstate(buf); 297 err = restore_user_xstate(buf);
301 else 298 else