Diffstat (limited to 'arch/x86/kernel')
68 files changed, 2349 insertions, 1106 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5369059c07a9..532d2e090e6f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o | |||
69 | obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o | 69 | obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o |
70 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o | 70 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o |
71 | obj-$(CONFIG_KPROBES) += kprobes.o | 71 | obj-$(CONFIG_KPROBES) += kprobes.o |
72 | obj-$(CONFIG_OPTPROBES) += kprobes-opt.o | ||
72 | obj-$(CONFIG_MODULES) += module.o | 73 | obj-$(CONFIG_MODULES) += module.o |
73 | obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o | 74 | obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o |
74 | obj-$(CONFIG_KGDB) += kgdb.o | 75 | obj-$(CONFIG_KGDB) += kgdb.o |
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ce664f33ea8e..406ed77216d0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void) | |||
593 | #ifdef CONFIG_ACPI_HOTPLUG_CPU | 593 | #ifdef CONFIG_ACPI_HOTPLUG_CPU |
594 | #include <acpi/processor.h> | 594 | #include <acpi/processor.h> |
595 | 595 | ||
596 | static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) | 596 | static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) |
597 | { | 597 | { |
598 | #ifdef CONFIG_ACPI_NUMA | 598 | #ifdef CONFIG_ACPI_NUMA |
599 | int nid; | 599 | int nid; |
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 8c3cdded6f2b..359b6899a36c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -180,6 +180,7 @@ static struct apic apic_flat = { | |||
180 | .name = "flat", | 180 | .name = "flat", |
181 | .probe = flat_probe, | 181 | .probe = flat_probe, |
182 | .acpi_madt_oem_check = flat_acpi_madt_oem_check, | 182 | .acpi_madt_oem_check = flat_acpi_madt_oem_check, |
183 | .apic_id_valid = default_apic_id_valid, | ||
183 | .apic_id_registered = flat_apic_id_registered, | 184 | .apic_id_registered = flat_apic_id_registered, |
184 | 185 | ||
185 | .irq_delivery_mode = dest_LowestPrio, | 186 | .irq_delivery_mode = dest_LowestPrio, |
@@ -337,6 +338,7 @@ static struct apic apic_physflat = { | |||
337 | .name = "physical flat", | 338 | .name = "physical flat", |
338 | .probe = physflat_probe, | 339 | .probe = physflat_probe, |
339 | .acpi_madt_oem_check = physflat_acpi_madt_oem_check, | 340 | .acpi_madt_oem_check = physflat_acpi_madt_oem_check, |
341 | .apic_id_valid = default_apic_id_valid, | ||
340 | .apic_id_registered = flat_apic_id_registered, | 342 | .apic_id_registered = flat_apic_id_registered, |
341 | 343 | ||
342 | .irq_delivery_mode = dest_Fixed, | 344 | .irq_delivery_mode = dest_Fixed, |
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 775b82bc655c..634ae6cdd5c9 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -124,6 +124,7 @@ struct apic apic_noop = { | |||
124 | .probe = noop_probe, | 124 | .probe = noop_probe, |
125 | .acpi_madt_oem_check = NULL, | 125 | .acpi_madt_oem_check = NULL, |
126 | 126 | ||
127 | .apic_id_valid = default_apic_id_valid, | ||
127 | .apic_id_registered = noop_apic_id_registered, | 128 | .apic_id_registered = noop_apic_id_registered, |
128 | 129 | ||
129 | .irq_delivery_mode = dest_LowestPrio, | 130 | .irq_delivery_mode = dest_LowestPrio, |
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 09d3d8c1cd99..d9ea5f331ac5 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,6 +56,12 @@ static unsigned int read_xapic_id(void) | |||
56 | return get_apic_id(apic_read(APIC_ID)); | 56 | return get_apic_id(apic_read(APIC_ID)); |
57 | } | 57 | } |
58 | 58 | ||
59 | static int numachip_apic_id_valid(int apicid) | ||
60 | { | ||
61 | /* Trust what bootloader passes in MADT */ | ||
62 | return 1; | ||
63 | } | ||
64 | |||
59 | static int numachip_apic_id_registered(void) | 65 | static int numachip_apic_id_registered(void) |
60 | { | 66 | { |
61 | return physid_isset(read_xapic_id(), phys_cpu_present_map); | 67 | return physid_isset(read_xapic_id(), phys_cpu_present_map); |
@@ -223,10 +229,11 @@ static int __init numachip_system_init(void) | |||
223 | } | 229 | } |
224 | early_initcall(numachip_system_init); | 230 | early_initcall(numachip_system_init); |
225 | 231 | ||
226 | static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 232 | static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
227 | { | 233 | { |
228 | if (!strncmp(oem_id, "NUMASC", 6)) { | 234 | if (!strncmp(oem_id, "NUMASC", 6)) { |
229 | numachip_system = 1; | 235 | numachip_system = 1; |
236 | setup_force_cpu_cap(X86_FEATURE_X2APIC); | ||
230 | return 1; | 237 | return 1; |
231 | } | 238 | } |
232 | 239 | ||
@@ -238,6 +245,7 @@ static struct apic apic_numachip __refconst = { | |||
238 | .name = "NumaConnect system", | 245 | .name = "NumaConnect system", |
239 | .probe = numachip_probe, | 246 | .probe = numachip_probe, |
240 | .acpi_madt_oem_check = numachip_acpi_madt_oem_check, | 247 | .acpi_madt_oem_check = numachip_acpi_madt_oem_check, |
248 | .apic_id_valid = numachip_apic_id_valid, | ||
241 | .apic_id_registered = numachip_apic_id_registered, | 249 | .apic_id_registered = numachip_apic_id_registered, |
242 | 250 | ||
243 | .irq_delivery_mode = dest_Fixed, | 251 | .irq_delivery_mode = dest_Fixed, |
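The new ->apic_id_valid() callback exists because NumaConnect firmware can hand out APIC IDs above the classic 8-bit range once X86_FEATURE_X2APIC is forced (see the MADT OEM check above), so numachip simply trusts the table and returns 1. Every other driver touched by this diff points the callback at default_apic_id_valid(); that helper lives in <asm/apic.h>, outside this section, but a minimal sketch of the check it has to perform (not a quote of the real helper) would be:

static inline int default_apic_id_valid(int apicid)
{
	/* xAPIC IDs are 8 bits wide and 0xff is the broadcast ID */
	return apicid < 255;
}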
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 521bead01137..0cdec7065aff 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -198,6 +198,7 @@ static struct apic apic_bigsmp = { | |||
198 | .name = "bigsmp", | 198 | .name = "bigsmp", |
199 | .probe = probe_bigsmp, | 199 | .probe = probe_bigsmp, |
200 | .acpi_madt_oem_check = NULL, | 200 | .acpi_madt_oem_check = NULL, |
201 | .apic_id_valid = default_apic_id_valid, | ||
201 | .apic_id_registered = bigsmp_apic_id_registered, | 202 | .apic_id_registered = bigsmp_apic_id_registered, |
202 | 203 | ||
203 | .irq_delivery_mode = dest_Fixed, | 204 | .irq_delivery_mode = dest_Fixed, |
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 5d513bc47b6b..e42d1d3b9134 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -625,6 +625,7 @@ static struct apic __refdata apic_es7000_cluster = { | |||
625 | .name = "es7000", | 625 | .name = "es7000", |
626 | .probe = probe_es7000, | 626 | .probe = probe_es7000, |
627 | .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster, | 627 | .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster, |
628 | .apic_id_valid = default_apic_id_valid, | ||
628 | .apic_id_registered = es7000_apic_id_registered, | 629 | .apic_id_registered = es7000_apic_id_registered, |
629 | 630 | ||
630 | .irq_delivery_mode = dest_LowestPrio, | 631 | .irq_delivery_mode = dest_LowestPrio, |
@@ -690,6 +691,7 @@ static struct apic __refdata apic_es7000 = { | |||
690 | .name = "es7000", | 691 | .name = "es7000", |
691 | .probe = probe_es7000, | 692 | .probe = probe_es7000, |
692 | .acpi_madt_oem_check = es7000_acpi_madt_oem_check, | 693 | .acpi_madt_oem_check = es7000_acpi_madt_oem_check, |
694 | .apic_id_valid = default_apic_id_valid, | ||
693 | .apic_id_registered = es7000_apic_id_registered, | 695 | .apic_id_registered = es7000_apic_id_registered, |
694 | 696 | ||
695 | .irq_delivery_mode = dest_Fixed, | 697 | .irq_delivery_mode = dest_Fixed, |
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index fb072754bc1d..6d10a66fc5a9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3967,18 +3967,36 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) | |||
3967 | static __init int bad_ioapic(unsigned long address) | 3967 | static __init int bad_ioapic(unsigned long address) |
3968 | { | 3968 | { |
3969 | if (nr_ioapics >= MAX_IO_APICS) { | 3969 | if (nr_ioapics >= MAX_IO_APICS) { |
3970 | printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " | 3970 | pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n", |
3971 | "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); | 3971 | MAX_IO_APICS, nr_ioapics); |
3972 | return 1; | 3972 | return 1; |
3973 | } | 3973 | } |
3974 | if (!address) { | 3974 | if (!address) { |
3975 | printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" | 3975 | pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n"); |
3976 | " found in table, skipping!\n"); | ||
3977 | return 1; | 3976 | return 1; |
3978 | } | 3977 | } |
3979 | return 0; | 3978 | return 0; |
3980 | } | 3979 | } |
3981 | 3980 | ||
3981 | static __init int bad_ioapic_register(int idx) | ||
3982 | { | ||
3983 | union IO_APIC_reg_00 reg_00; | ||
3984 | union IO_APIC_reg_01 reg_01; | ||
3985 | union IO_APIC_reg_02 reg_02; | ||
3986 | |||
3987 | reg_00.raw = io_apic_read(idx, 0); | ||
3988 | reg_01.raw = io_apic_read(idx, 1); | ||
3989 | reg_02.raw = io_apic_read(idx, 2); | ||
3990 | |||
3991 | if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) { | ||
3992 | pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n", | ||
3993 | mpc_ioapic_addr(idx)); | ||
3994 | return 1; | ||
3995 | } | ||
3996 | |||
3997 | return 0; | ||
3998 | } | ||
3999 | |||
3982 | void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) | 4000 | void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) |
3983 | { | 4001 | { |
3984 | int idx = 0; | 4002 | int idx = 0; |
@@ -3995,6 +4013,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) | |||
3995 | ioapics[idx].mp_config.apicaddr = address; | 4013 | ioapics[idx].mp_config.apicaddr = address; |
3996 | 4014 | ||
3997 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | 4015 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); |
4016 | |||
4017 | if (bad_ioapic_register(idx)) { | ||
4018 | clear_fixmap(FIX_IO_APIC_BASE_0 + idx); | ||
4019 | return; | ||
4020 | } | ||
4021 | |||
3998 | ioapics[idx].mp_config.apicid = io_apic_unique_id(id); | 4022 | ioapics[idx].mp_config.apicid = io_apic_unique_id(id); |
3999 | ioapics[idx].mp_config.apicver = io_apic_get_version(idx); | 4023 | ioapics[idx].mp_config.apicver = io_apic_get_version(idx); |
4000 | 4024 | ||
@@ -4015,10 +4039,10 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) | |||
4015 | if (gsi_cfg->gsi_end >= gsi_top) | 4039 | if (gsi_cfg->gsi_end >= gsi_top) |
4016 | gsi_top = gsi_cfg->gsi_end + 1; | 4040 | gsi_top = gsi_cfg->gsi_end + 1; |
4017 | 4041 | ||
4018 | printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " | 4042 | pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", |
4019 | "GSI %d-%d\n", idx, mpc_ioapic_id(idx), | 4043 | idx, mpc_ioapic_id(idx), |
4020 | mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), | 4044 | mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), |
4021 | gsi_cfg->gsi_base, gsi_cfg->gsi_end); | 4045 | gsi_cfg->gsi_base, gsi_cfg->gsi_end); |
4022 | 4046 | ||
4023 | nr_ioapics++; | 4047 | nr_ioapics++; |
4024 | } | 4048 | } |
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index c4a61ca1349a..00d2422ca7c9 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -478,6 +478,7 @@ static struct apic __refdata apic_numaq = { | |||
478 | .name = "NUMAQ", | 478 | .name = "NUMAQ", |
479 | .probe = probe_numaq, | 479 | .probe = probe_numaq, |
480 | .acpi_madt_oem_check = NULL, | 480 | .acpi_madt_oem_check = NULL, |
481 | .apic_id_valid = default_apic_id_valid, | ||
481 | .apic_id_registered = numaq_apic_id_registered, | 482 | .apic_id_registered = numaq_apic_id_registered, |
482 | 483 | ||
483 | .irq_delivery_mode = dest_LowestPrio, | 484 | .irq_delivery_mode = dest_LowestPrio, |
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0787bb3412f4..ff2c1b9aac4d 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -92,6 +92,7 @@ static struct apic apic_default = { | |||
92 | .name = "default", | 92 | .name = "default", |
93 | .probe = probe_default, | 93 | .probe = probe_default, |
94 | .acpi_madt_oem_check = NULL, | 94 | .acpi_madt_oem_check = NULL, |
95 | .apic_id_valid = default_apic_id_valid, | ||
95 | .apic_id_registered = default_apic_id_registered, | 96 | .apic_id_registered = default_apic_id_registered, |
96 | 97 | ||
97 | .irq_delivery_mode = dest_LowestPrio, | 98 | .irq_delivery_mode = dest_LowestPrio, |
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 19114423c58c..fea000b27f07 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -496,6 +496,7 @@ static struct apic apic_summit = { | |||
496 | .name = "summit", | 496 | .name = "summit", |
497 | .probe = probe_summit, | 497 | .probe = probe_summit, |
498 | .acpi_madt_oem_check = summit_acpi_madt_oem_check, | 498 | .acpi_madt_oem_check = summit_acpi_madt_oem_check, |
499 | .apic_id_valid = default_apic_id_valid, | ||
499 | .apic_id_registered = summit_apic_id_registered, | 500 | .apic_id_registered = summit_apic_id_registered, |
500 | 501 | ||
501 | .irq_delivery_mode = dest_LowestPrio, | 502 | .irq_delivery_mode = dest_LowestPrio, |
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 500795875827..9193713060a9 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -213,6 +213,7 @@ static struct apic apic_x2apic_cluster = { | |||
213 | .name = "cluster x2apic", | 213 | .name = "cluster x2apic", |
214 | .probe = x2apic_cluster_probe, | 214 | .probe = x2apic_cluster_probe, |
215 | .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, | 215 | .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, |
216 | .apic_id_valid = default_apic_id_valid, | ||
216 | .apic_id_registered = x2apic_apic_id_registered, | 217 | .apic_id_registered = x2apic_apic_id_registered, |
217 | 218 | ||
218 | .irq_delivery_mode = dest_LowestPrio, | 219 | .irq_delivery_mode = dest_LowestPrio, |
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index f5373dfde21e..bcd1db6eaca9 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -119,6 +119,7 @@ static struct apic apic_x2apic_phys = { | |||
119 | .name = "physical x2apic", | 119 | .name = "physical x2apic", |
120 | .probe = x2apic_phys_probe, | 120 | .probe = x2apic_phys_probe, |
121 | .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, | 121 | .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, |
122 | .apic_id_valid = default_apic_id_valid, | ||
122 | .apic_id_registered = x2apic_apic_id_registered, | 123 | .apic_id_registered = x2apic_apic_id_registered, |
123 | 124 | ||
124 | .irq_delivery_mode = dest_Fixed, | 125 | .irq_delivery_mode = dest_Fixed, |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 79b05b88aa19..fc4771425852 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -351,6 +351,7 @@ static struct apic __refdata apic_x2apic_uv_x = { | |||
351 | .name = "UV large system", | 351 | .name = "UV large system", |
352 | .probe = uv_probe, | 352 | .probe = uv_probe, |
353 | .acpi_madt_oem_check = uv_acpi_madt_oem_check, | 353 | .acpi_madt_oem_check = uv_acpi_madt_oem_check, |
354 | .apic_id_valid = default_apic_id_valid, | ||
354 | .apic_id_registered = uv_apic_id_registered, | 355 | .apic_id_registered = uv_apic_id_registered, |
355 | 356 | ||
356 | .irq_delivery_mode = dest_Fixed, | 357 | .irq_delivery_mode = dest_Fixed, |
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f76623cbe263..5d56931a15b3 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1234,8 +1234,7 @@ static int suspend(int vetoable) | |||
1234 | struct apm_user *as; | 1234 | struct apm_user *as; |
1235 | 1235 | ||
1236 | dpm_suspend_start(PMSG_SUSPEND); | 1236 | dpm_suspend_start(PMSG_SUSPEND); |
1237 | 1237 | dpm_suspend_end(PMSG_SUSPEND); | |
1238 | dpm_suspend_noirq(PMSG_SUSPEND); | ||
1239 | 1238 | ||
1240 | local_irq_disable(); | 1239 | local_irq_disable(); |
1241 | syscore_suspend(); | 1240 | syscore_suspend(); |
@@ -1259,9 +1258,9 @@ static int suspend(int vetoable) | |||
1259 | syscore_resume(); | 1258 | syscore_resume(); |
1260 | local_irq_enable(); | 1259 | local_irq_enable(); |
1261 | 1260 | ||
1262 | dpm_resume_noirq(PMSG_RESUME); | 1261 | dpm_resume_start(PMSG_RESUME); |
1263 | |||
1264 | dpm_resume_end(PMSG_RESUME); | 1262 | dpm_resume_end(PMSG_RESUME); |
1263 | |||
1265 | queue_event(APM_NORMAL_RESUME, NULL); | 1264 | queue_event(APM_NORMAL_RESUME, NULL); |
1266 | spin_lock(&user_list_lock); | 1265 | spin_lock(&user_list_lock); |
1267 | for (as = user_list; as != NULL; as = as->next) { | 1266 | for (as = user_list; as != NULL; as = as->next) { |
@@ -1277,7 +1276,7 @@ static void standby(void) | |||
1277 | { | 1276 | { |
1278 | int err; | 1277 | int err; |
1279 | 1278 | ||
1280 | dpm_suspend_noirq(PMSG_SUSPEND); | 1279 | dpm_suspend_end(PMSG_SUSPEND); |
1281 | 1280 | ||
1282 | local_irq_disable(); | 1281 | local_irq_disable(); |
1283 | syscore_suspend(); | 1282 | syscore_suspend(); |
@@ -1291,7 +1290,7 @@ static void standby(void) | |||
1291 | syscore_resume(); | 1290 | syscore_resume(); |
1292 | local_irq_enable(); | 1291 | local_irq_enable(); |
1293 | 1292 | ||
1294 | dpm_resume_noirq(PMSG_RESUME); | 1293 | dpm_resume_start(PMSG_RESUME); |
1295 | } | 1294 | } |
1296 | 1295 | ||
1297 | static apm_event_t get_event(void) | 1296 | static apm_event_t get_event(void) |
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 25f24dccdcfa..6ab6aa2fdfdd 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -16,6 +16,7 @@ obj-y := intel_cacheinfo.o scattered.o topology.o | |||
16 | obj-y += proc.o capflags.o powerflags.o common.o | 16 | obj-y += proc.o capflags.o powerflags.o common.o |
17 | obj-y += vmware.o hypervisor.o sched.o mshyperv.o | 17 | obj-y += vmware.o hypervisor.o sched.o mshyperv.o |
18 | obj-y += rdrand.o | 18 | obj-y += rdrand.o |
19 | obj-y += match.o | ||
19 | 20 | ||
20 | obj-$(CONFIG_X86_32) += bugs.o | 21 | obj-$(CONFIG_X86_32) += bugs.o |
21 | obj-$(CONFIG_X86_64) += bugs_64.o | 22 | obj-$(CONFIG_X86_64) += bugs_64.o |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f4773f4aae35..0a44b90602b0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | 6 | ||
7 | #include <linux/io.h> | 7 | #include <linux/io.h> |
8 | #include <linux/sched.h> | ||
8 | #include <asm/processor.h> | 9 | #include <asm/processor.h> |
9 | #include <asm/apic.h> | 10 | #include <asm/apic.h> |
10 | #include <asm/cpu.h> | 11 | #include <asm/cpu.h> |
@@ -456,6 +457,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | |||
456 | if (c->x86_power & (1 << 8)) { | 457 | if (c->x86_power & (1 << 8)) { |
457 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | 458 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); |
458 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); | 459 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); |
460 | if (!check_tsc_unstable()) | ||
461 | sched_clock_stable = 1; | ||
459 | } | 462 | } |
460 | 463 | ||
461 | #ifdef CONFIG_X86_64 | 464 | #ifdef CONFIG_X86_64 |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d43cad74f166..e49477444fff 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -18,6 +18,7 @@ | |||
18 | #include <asm/archrandom.h> | 18 | #include <asm/archrandom.h> |
19 | #include <asm/hypervisor.h> | 19 | #include <asm/hypervisor.h> |
20 | #include <asm/processor.h> | 20 | #include <asm/processor.h> |
21 | #include <asm/debugreg.h> | ||
21 | #include <asm/sections.h> | 22 | #include <asm/sections.h> |
22 | #include <linux/topology.h> | 23 | #include <linux/topology.h> |
23 | #include <linux/cpumask.h> | 24 | #include <linux/cpumask.h> |
@@ -28,6 +29,7 @@ | |||
28 | #include <asm/apic.h> | 29 | #include <asm/apic.h> |
29 | #include <asm/desc.h> | 30 | #include <asm/desc.h> |
30 | #include <asm/i387.h> | 31 | #include <asm/i387.h> |
32 | #include <asm/fpu-internal.h> | ||
31 | #include <asm/mtrr.h> | 33 | #include <asm/mtrr.h> |
32 | #include <linux/numa.h> | 34 | #include <linux/numa.h> |
33 | #include <asm/asm.h> | 35 | #include <asm/asm.h> |
@@ -933,7 +935,7 @@ static const struct msr_range msr_range_array[] __cpuinitconst = { | |||
933 | { 0xc0011000, 0xc001103b}, | 935 | { 0xc0011000, 0xc001103b}, |
934 | }; | 936 | }; |
935 | 937 | ||
936 | static void __cpuinit print_cpu_msr(void) | 938 | static void __cpuinit __print_cpu_msr(void) |
937 | { | 939 | { |
938 | unsigned index_min, index_max; | 940 | unsigned index_min, index_max; |
939 | unsigned index; | 941 | unsigned index; |
@@ -997,13 +999,13 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) | |||
997 | else | 999 | else |
998 | printk(KERN_CONT "\n"); | 1000 | printk(KERN_CONT "\n"); |
999 | 1001 | ||
1000 | #ifdef CONFIG_SMP | 1002 | __print_cpu_msr(); |
1003 | } | ||
1004 | |||
1005 | void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) | ||
1006 | { | ||
1001 | if (c->cpu_index < show_msr) | 1007 | if (c->cpu_index < show_msr) |
1002 | print_cpu_msr(); | 1008 | __print_cpu_msr(); |
1003 | #else | ||
1004 | if (show_msr) | ||
1005 | print_cpu_msr(); | ||
1006 | #endif | ||
1007 | } | 1009 | } |
1008 | 1010 | ||
1009 | static __init int setup_disablecpuid(char *arg) | 1011 | static __init int setup_disablecpuid(char *arg) |
@@ -1044,6 +1046,8 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = | |||
1044 | 1046 | ||
1045 | DEFINE_PER_CPU(unsigned int, irq_count) = -1; | 1047 | DEFINE_PER_CPU(unsigned int, irq_count) = -1; |
1046 | 1048 | ||
1049 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); | ||
1050 | |||
1047 | /* | 1051 | /* |
1048 | * Special IST stacks which the CPU switches to when it calls | 1052 | * Special IST stacks which the CPU switches to when it calls |
1049 | * an IST-marked descriptor entry. Up to 7 stacks (hardware | 1053 | * an IST-marked descriptor entry. Up to 7 stacks (hardware |
@@ -1111,6 +1115,7 @@ void debug_stack_reset(void) | |||
1111 | 1115 | ||
1112 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; | 1116 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; |
1113 | EXPORT_PER_CPU_SYMBOL(current_task); | 1117 | EXPORT_PER_CPU_SYMBOL(current_task); |
1118 | DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); | ||
1114 | 1119 | ||
1115 | #ifdef CONFIG_CC_STACKPROTECTOR | 1120 | #ifdef CONFIG_CC_STACKPROTECTOR |
1116 | DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); | 1121 | DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); |
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b45e5e7a901..73d08ed98a64 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) | |||
326 | l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; | 326 | l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; |
327 | } | 327 | } |
328 | 328 | ||
329 | static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, | 329 | static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) |
330 | int index) | ||
331 | { | 330 | { |
332 | int node; | 331 | int node; |
333 | 332 | ||
@@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); | |||
725 | #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) | 724 | #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) |
726 | 725 | ||
727 | #ifdef CONFIG_SMP | 726 | #ifdef CONFIG_SMP |
728 | static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) | 727 | |
728 | static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) | ||
729 | { | 729 | { |
730 | struct _cpuid4_info *this_leaf, *sibling_leaf; | 730 | struct _cpuid4_info *this_leaf; |
731 | unsigned long num_threads_sharing; | 731 | int ret, i, sibling; |
732 | int index_msb, i, sibling; | ||
733 | struct cpuinfo_x86 *c = &cpu_data(cpu); | 732 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
734 | 733 | ||
735 | if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { | 734 | ret = 0; |
735 | if (index == 3) { | ||
736 | ret = 1; | ||
736 | for_each_cpu(i, cpu_llc_shared_mask(cpu)) { | 737 | for_each_cpu(i, cpu_llc_shared_mask(cpu)) { |
737 | if (!per_cpu(ici_cpuid4_info, i)) | 738 | if (!per_cpu(ici_cpuid4_info, i)) |
738 | continue; | 739 | continue; |
@@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) | |||
743 | set_bit(sibling, this_leaf->shared_cpu_map); | 744 | set_bit(sibling, this_leaf->shared_cpu_map); |
744 | } | 745 | } |
745 | } | 746 | } |
746 | return; | 747 | } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) { |
748 | ret = 1; | ||
749 | for_each_cpu(i, cpu_sibling_mask(cpu)) { | ||
750 | if (!per_cpu(ici_cpuid4_info, i)) | ||
751 | continue; | ||
752 | this_leaf = CPUID4_INFO_IDX(i, index); | ||
753 | for_each_cpu(sibling, cpu_sibling_mask(cpu)) { | ||
754 | if (!cpu_online(sibling)) | ||
755 | continue; | ||
756 | set_bit(sibling, this_leaf->shared_cpu_map); | ||
757 | } | ||
758 | } | ||
747 | } | 759 | } |
760 | |||
761 | return ret; | ||
762 | } | ||
763 | |||
764 | static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) | ||
765 | { | ||
766 | struct _cpuid4_info *this_leaf, *sibling_leaf; | ||
767 | unsigned long num_threads_sharing; | ||
768 | int index_msb, i; | ||
769 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
770 | |||
771 | if (c->x86_vendor == X86_VENDOR_AMD) { | ||
772 | if (cache_shared_amd_cpu_map_setup(cpu, index)) | ||
773 | return; | ||
774 | } | ||
775 | |||
748 | this_leaf = CPUID4_INFO_IDX(cpu, index); | 776 | this_leaf = CPUID4_INFO_IDX(cpu, index); |
749 | num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; | 777 | num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; |
750 | 778 | ||
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
new file mode 100644
index 000000000000..5502b289341b
--- /dev/null
+++ b/arch/x86/kernel/cpu/match.c
@@ -0,0 +1,91 @@ | |||
1 | #include <asm/cpu_device_id.h> | ||
2 | #include <asm/processor.h> | ||
3 | #include <linux/cpu.h> | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/slab.h> | ||
6 | |||
7 | /** | ||
8 | * x86_match_cpu - match current CPU against an array of x86_cpu_ids | ||
9 | * @match: Pointer to array of x86_cpu_ids. Last entry terminated with | ||
10 | * {}. | ||
11 | * | ||
12 | * Return the entry if the current CPU matches the entries in the | ||
13 | * passed x86_cpu_id match table. Otherwise NULL. The match table | ||
14 | * contains vendor (X86_VENDOR_*), family, model and feature bits or | ||
15 | * respective wildcard entries. | ||
16 | * | ||
17 | * A typical table entry would be to match a specific CPU | ||
18 | * { X86_VENDOR_INTEL, 6, 0x12 } | ||
19 | * or to match a specific CPU feature | ||
20 | * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } | ||
21 | * | ||
22 | * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, | ||
23 | * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) | ||
24 | * | ||
25 | * Arrays used to match for this should also be declared using | ||
26 | * MODULE_DEVICE_TABLE(x86_cpu, ...) | ||
27 | * | ||
28 | * This always matches against the boot cpu, assuming models and features are | ||
29 | * consistent over all CPUs. | ||
30 | */ | ||
31 | const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) | ||
32 | { | ||
33 | const struct x86_cpu_id *m; | ||
34 | struct cpuinfo_x86 *c = &boot_cpu_data; | ||
35 | |||
36 | for (m = match; m->vendor | m->family | m->model | m->feature; m++) { | ||
37 | if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) | ||
38 | continue; | ||
39 | if (m->family != X86_FAMILY_ANY && c->x86 != m->family) | ||
40 | continue; | ||
41 | if (m->model != X86_MODEL_ANY && c->x86_model != m->model) | ||
42 | continue; | ||
43 | if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) | ||
44 | continue; | ||
45 | return m; | ||
46 | } | ||
47 | return NULL; | ||
48 | } | ||
49 | EXPORT_SYMBOL(x86_match_cpu); | ||
50 | |||
51 | ssize_t arch_print_cpu_modalias(struct device *dev, | ||
52 | struct device_attribute *attr, | ||
53 | char *bufptr) | ||
54 | { | ||
55 | int size = PAGE_SIZE; | ||
56 | int i, n; | ||
57 | char *buf = bufptr; | ||
58 | |||
59 | n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:" | ||
60 | "model:%04X:feature:", | ||
61 | boot_cpu_data.x86_vendor, | ||
62 | boot_cpu_data.x86, | ||
63 | boot_cpu_data.x86_model); | ||
64 | size -= n; | ||
65 | buf += n; | ||
66 | size -= 1; | ||
67 | for (i = 0; i < NCAPINTS*32; i++) { | ||
68 | if (boot_cpu_has(i)) { | ||
69 | n = snprintf(buf, size, ",%04X", i); | ||
70 | if (n >= size) { | ||
71 | WARN(1, "x86 features overflow page\n"); | ||
72 | break; | ||
73 | } | ||
74 | size -= n; | ||
75 | buf += n; | ||
76 | } | ||
77 | } | ||
78 | *buf++ = '\n'; | ||
79 | return buf - bufptr; | ||
80 | } | ||
81 | |||
82 | int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env) | ||
83 | { | ||
84 | char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); | ||
85 | if (buf) { | ||
86 | arch_print_cpu_modalias(NULL, NULL, buf); | ||
87 | add_uevent_var(env, "MODALIAS=%s", buf); | ||
88 | kfree(buf); | ||
89 | } | ||
90 | return 0; | ||
91 | } | ||
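As the kerneldoc above says, the point of x86_match_cpu() is to let drivers replace open-coded vendor/family/model/feature checks with a declarative table. A hedged usage sketch follows; the driver, table name and chosen feature are illustrative and not part of this patch, and the device-table class "x86cpu" is taken from the modalias prefix emitted by arch_print_cpu_modalias() above:

#include <linux/module.h>
#include <asm/cpu_device_id.h>

/* Load only on CPUs advertising AES-NI, whatever the vendor/family/model. */
static const struct x86_cpu_id foo_cpu_ids[] = {
	X86_FEATURE_MATCH(X86_FEATURE_AES),
	{}				/* empty entry terminates the table */
};
MODULE_DEVICE_TABLE(x86cpu, foo_cpu_ids);

static int __init foo_init(void)
{
	if (!x86_match_cpu(foo_cpu_ids))
		return -ENODEV;		/* boot CPU lacks the feature */
	/* ... normal driver registration ... */
	return 0;
}
module_init(foo_init);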
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 7395d5f4272d..0c82091b1652 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -54,7 +54,14 @@ static struct severity { | |||
54 | #define MASK(x, y) .mask = x, .result = y | 54 | #define MASK(x, y) .mask = x, .result = y |
55 | #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) | 55 | #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) |
56 | #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) | 56 | #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) |
57 | #define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) | ||
57 | #define MCACOD 0xffff | 58 | #define MCACOD 0xffff |
59 | /* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ | ||
60 | #define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ | ||
61 | #define MCACOD_SCRUBMSK 0xfff0 | ||
62 | #define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ | ||
63 | #define MCACOD_DATA 0x0134 /* Data Load */ | ||
64 | #define MCACOD_INSTR 0x0150 /* Instruction Fetch */ | ||
58 | 65 | ||
59 | MCESEV( | 66 | MCESEV( |
60 | NO, "Invalid", | 67 | NO, "Invalid", |
@@ -102,11 +109,24 @@ static struct severity { | |||
102 | SER, BITCLR(MCI_STATUS_S) | 109 | SER, BITCLR(MCI_STATUS_S) |
103 | ), | 110 | ), |
104 | 111 | ||
105 | /* AR add known MCACODs here */ | ||
106 | MCESEV( | 112 | MCESEV( |
107 | PANIC, "Action required with lost events", | 113 | PANIC, "Action required with lost events", |
108 | SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) | 114 | SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) |
109 | ), | 115 | ), |
116 | |||
117 | /* known AR MCACODs: */ | ||
118 | #ifdef CONFIG_MEMORY_FAILURE | ||
119 | MCESEV( | ||
120 | KEEP, "HT thread notices Action required: data load error", | ||
121 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), | ||
122 | MCGMASK(MCG_STATUS_EIPV, 0) | ||
123 | ), | ||
124 | MCESEV( | ||
125 | AR, "Action required: data load error", | ||
126 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), | ||
127 | USER | ||
128 | ), | ||
129 | #endif | ||
110 | MCESEV( | 130 | MCESEV( |
111 | PANIC, "Action required: unknown MCACOD", | 131 | PANIC, "Action required: unknown MCACOD", |
112 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) | 132 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) |
@@ -115,11 +135,11 @@ static struct severity { | |||
115 | /* known AO MCACODs: */ | 135 | /* known AO MCACODs: */ |
116 | MCESEV( | 136 | MCESEV( |
117 | AO, "Action optional: memory scrubbing error", | 137 | AO, "Action optional: memory scrubbing error", |
118 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0) | 138 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) |
119 | ), | 139 | ), |
120 | MCESEV( | 140 | MCESEV( |
121 | AO, "Action optional: last level cache writeback error", | 141 | AO, "Action optional: last level cache writeback error", |
122 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a) | 142 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) |
123 | ), | 143 | ), |
124 | MCESEV( | 144 | MCESEV( |
125 | SOME, "Action optional: unknown MCACOD", | 145 | SOME, "Action optional: unknown MCACOD", |
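To see how the new action-required entries fire, consider an illustrative bank status built from the constants defined above (the combination is made up for the example, not taken from the patch):

	u64 status = MCI_STATUS_VAL | MCI_STATUS_EN |
		     MCI_STATUS_UC | MCI_STATUS_S | MCI_STATUS_AR |	/* MCI_UC_SAR */
		     MCI_STATUS_ADDRV | MCI_STATUS_MISCV |		/* MCI_ADDR   */
		     MCACOD_DATA;					/* 0x0134     */

	/*
	 * status & (MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD)
	 *	== MCI_UC_SAR|MCI_ADDR|MCACOD_DATA
	 * so the "data load error" rules match: severity is AR for the
	 * context that consumed the poison (USER) and KEEP for a sibling
	 * HT thread that merely notices it (MCG_STATUS_EIPV clear).
	 */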
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e91..d086a09c087d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -191,7 +191,7 @@ static void drain_mcelog_buffer(void) | |||
191 | { | 191 | { |
192 | unsigned int next, i, prev = 0; | 192 | unsigned int next, i, prev = 0; |
193 | 193 | ||
194 | next = rcu_dereference_check_mce(mcelog.next); | 194 | next = ACCESS_ONCE(mcelog.next); |
195 | 195 | ||
196 | do { | 196 | do { |
197 | struct mce *m; | 197 | struct mce *m; |
@@ -540,6 +540,27 @@ static void mce_report_event(struct pt_regs *regs) | |||
540 | irq_work_queue(&__get_cpu_var(mce_irq_work)); | 540 | irq_work_queue(&__get_cpu_var(mce_irq_work)); |
541 | } | 541 | } |
542 | 542 | ||
543 | /* | ||
544 | * Read ADDR and MISC registers. | ||
545 | */ | ||
546 | static void mce_read_aux(struct mce *m, int i) | ||
547 | { | ||
548 | if (m->status & MCI_STATUS_MISCV) | ||
549 | m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); | ||
550 | if (m->status & MCI_STATUS_ADDRV) { | ||
551 | m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); | ||
552 | |||
553 | /* | ||
554 | * Mask the reported address by the reported granularity. | ||
555 | */ | ||
556 | if (mce_ser && (m->status & MCI_STATUS_MISCV)) { | ||
557 | u8 shift = MCI_MISC_ADDR_LSB(m->misc); | ||
558 | m->addr >>= shift; | ||
559 | m->addr <<= shift; | ||
560 | } | ||
561 | } | ||
562 | } | ||
563 | |||
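/*
 * (Illustrative note, not part of the patch: if the bank reports
 *  MCI_MISC_ADDR_LSB(m->misc) == 12, the two shifts above round the
 *  address down to a 4 KiB boundary, e.g. 0x12345678 becomes
 *  0x12345000, i.e. to the granularity the hardware says it actually
 *  recorded.)
 */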
543 | DEFINE_PER_CPU(unsigned, mce_poll_count); | 564 | DEFINE_PER_CPU(unsigned, mce_poll_count); |
544 | 565 | ||
545 | /* | 566 | /* |
@@ -590,10 +611,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
590 | (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) | 611 | (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) |
591 | continue; | 612 | continue; |
592 | 613 | ||
593 | if (m.status & MCI_STATUS_MISCV) | 614 | mce_read_aux(&m, i); |
594 | m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); | ||
595 | if (m.status & MCI_STATUS_ADDRV) | ||
596 | m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); | ||
597 | 615 | ||
598 | if (!(flags & MCP_TIMESTAMP)) | 616 | if (!(flags & MCP_TIMESTAMP)) |
599 | m.tsc = 0; | 617 | m.tsc = 0; |
@@ -917,6 +935,49 @@ static void mce_clear_state(unsigned long *toclear) | |||
917 | } | 935 | } |
918 | 936 | ||
919 | /* | 937 | /* |
938 | * Need to save faulting physical address associated with a process | ||
939 | * in the machine check handler some place where we can grab it back | ||
940 | * later in mce_notify_process() | ||
941 | */ | ||
942 | #define MCE_INFO_MAX 16 | ||
943 | |||
944 | struct mce_info { | ||
945 | atomic_t inuse; | ||
946 | struct task_struct *t; | ||
947 | __u64 paddr; | ||
948 | } mce_info[MCE_INFO_MAX]; | ||
949 | |||
950 | static void mce_save_info(__u64 addr) | ||
951 | { | ||
952 | struct mce_info *mi; | ||
953 | |||
954 | for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { | ||
955 | if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { | ||
956 | mi->t = current; | ||
957 | mi->paddr = addr; | ||
958 | return; | ||
959 | } | ||
960 | } | ||
961 | |||
962 | mce_panic("Too many concurrent recoverable errors", NULL, NULL); | ||
963 | } | ||
964 | |||
965 | static struct mce_info *mce_find_info(void) | ||
966 | { | ||
967 | struct mce_info *mi; | ||
968 | |||
969 | for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) | ||
970 | if (atomic_read(&mi->inuse) && mi->t == current) | ||
971 | return mi; | ||
972 | return NULL; | ||
973 | } | ||
974 | |||
975 | static void mce_clear_info(struct mce_info *mi) | ||
976 | { | ||
977 | atomic_set(&mi->inuse, 0); | ||
978 | } | ||
979 | |||
980 | /* | ||
920 | * The actual machine check handler. This only handles real | 981 | * The actual machine check handler. This only handles real |
921 | * exceptions when something got corrupted coming in through int 18. | 982 | * exceptions when something got corrupted coming in through int 18. |
922 | * | 983 | * |
@@ -969,7 +1030,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
969 | barrier(); | 1030 | barrier(); |
970 | 1031 | ||
971 | /* | 1032 | /* |
972 | * When no restart IP must always kill or panic. | 1033 | * When no restart IP might need to kill or panic. |
1034 | * Assume the worst for now, but if we find the | ||
1035 | * severity is MCE_AR_SEVERITY we have other options. | ||
973 | */ | 1036 | */ |
974 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | 1037 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) |
975 | kill_it = 1; | 1038 | kill_it = 1; |
@@ -1023,16 +1086,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1023 | continue; | 1086 | continue; |
1024 | } | 1087 | } |
1025 | 1088 | ||
1026 | /* | 1089 | mce_read_aux(&m, i); |
1027 | * Kill on action required. | ||
1028 | */ | ||
1029 | if (severity == MCE_AR_SEVERITY) | ||
1030 | kill_it = 1; | ||
1031 | |||
1032 | if (m.status & MCI_STATUS_MISCV) | ||
1033 | m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); | ||
1034 | if (m.status & MCI_STATUS_ADDRV) | ||
1035 | m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); | ||
1036 | 1090 | ||
1037 | /* | 1091 | /* |
1038 | * Action optional error. Queue address for later processing. | 1092 | * Action optional error. Queue address for later processing. |
@@ -1052,6 +1106,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1052 | } | 1106 | } |
1053 | } | 1107 | } |
1054 | 1108 | ||
1109 | /* mce_clear_state will clear *final, save locally for use later */ | ||
1110 | m = *final; | ||
1111 | |||
1055 | if (!no_way_out) | 1112 | if (!no_way_out) |
1056 | mce_clear_state(toclear); | 1113 | mce_clear_state(toclear); |
1057 | 1114 | ||
@@ -1063,27 +1120,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1063 | no_way_out = worst >= MCE_PANIC_SEVERITY; | 1120 | no_way_out = worst >= MCE_PANIC_SEVERITY; |
1064 | 1121 | ||
1065 | /* | 1122 | /* |
1066 | * If we have decided that we just CAN'T continue, and the user | 1123 | * At insane "tolerant" levels we take no action. Otherwise |
1067 | * has not set tolerant to an insane level, give up and die. | 1124 | * we only die if we have no other choice. For less serious |
1068 | * | 1125 | * issues we try to recover, or limit damage to the current |
1069 | * This is mainly used in the case when the system doesn't | 1126 | * process. |
1070 | * support MCE broadcasting or it has been disabled. | ||
1071 | */ | ||
1072 | if (no_way_out && tolerant < 3) | ||
1073 | mce_panic("Fatal machine check on current CPU", final, msg); | ||
1074 | |||
1075 | /* | ||
1076 | * If the error seems to be unrecoverable, something should be | ||
1077 | * done. Try to kill as little as possible. If we can kill just | ||
1078 | * one task, do that. If the user has set the tolerance very | ||
1079 | * high, don't try to do anything at all. | ||
1080 | */ | 1127 | */ |
1081 | 1128 | if (tolerant < 3) { | |
1082 | if (kill_it && tolerant < 3) | 1129 | if (no_way_out) |
1083 | force_sig(SIGBUS, current); | 1130 | mce_panic("Fatal machine check on current CPU", &m, msg); |
1084 | 1131 | if (worst == MCE_AR_SEVERITY) { | |
1085 | /* notify userspace ASAP */ | 1132 | /* schedule action before return to userland */ |
1086 | set_thread_flag(TIF_MCE_NOTIFY); | 1133 | mce_save_info(m.addr); |
1134 | set_thread_flag(TIF_MCE_NOTIFY); | ||
1135 | } else if (kill_it) { | ||
1136 | force_sig(SIGBUS, current); | ||
1137 | } | ||
1138 | } | ||
1087 | 1139 | ||
1088 | if (worst > 0) | 1140 | if (worst > 0) |
1089 | mce_report_event(regs); | 1141 | mce_report_event(regs); |
@@ -1094,34 +1146,57 @@ out: | |||
1094 | } | 1146 | } |
1095 | EXPORT_SYMBOL_GPL(do_machine_check); | 1147 | EXPORT_SYMBOL_GPL(do_machine_check); |
1096 | 1148 | ||
1097 | /* dummy to break dependency. actual code is in mm/memory-failure.c */ | 1149 | #ifndef CONFIG_MEMORY_FAILURE |
1098 | void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) | 1150 | int memory_failure(unsigned long pfn, int vector, int flags) |
1099 | { | 1151 | { |
1100 | printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); | 1152 | /* mce_severity() should not hand us an ACTION_REQUIRED error */ |
1153 | BUG_ON(flags & MF_ACTION_REQUIRED); | ||
1154 | printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" | ||
1155 | "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); | ||
1156 | |||
1157 | return 0; | ||
1101 | } | 1158 | } |
1159 | #endif | ||
1102 | 1160 | ||
1103 | /* | 1161 | /* |
1104 | * Called after mce notification in process context. This code | 1162 | * Called in process context that interrupted by MCE and marked with |
1105 | * is allowed to sleep. Call the high level VM handler to process | 1163 | * TIF_MCE_NOTIFY, just before returning to erroneous userland. |
1106 | * any corrupted pages. | 1164 | * This code is allowed to sleep. |
1107 | * Assume that the work queue code only calls this one at a time | 1165 | * Attempt possible recovery such as calling the high level VM handler to |
1108 | * per CPU. | 1166 | * process any corrupted pages, and kill/signal current process if required. |
1109 | * Note we don't disable preemption, so this code might run on the wrong | 1167 | * Action required errors are handled here. |
1110 | * CPU. In this case the event is picked up by the scheduled work queue. | ||
1111 | * This is merely a fast path to expedite processing in some common | ||
1112 | * cases. | ||
1113 | */ | 1168 | */ |
1114 | void mce_notify_process(void) | 1169 | void mce_notify_process(void) |
1115 | { | 1170 | { |
1116 | unsigned long pfn; | 1171 | unsigned long pfn; |
1117 | mce_notify_irq(); | 1172 | struct mce_info *mi = mce_find_info(); |
1118 | while (mce_ring_get(&pfn)) | 1173 | |
1119 | memory_failure(pfn, MCE_VECTOR); | 1174 | if (!mi) |
1175 | mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); | ||
1176 | pfn = mi->paddr >> PAGE_SHIFT; | ||
1177 | |||
1178 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
1179 | |||
1180 | pr_err("Uncorrected hardware memory error in user-access at %llx", | ||
1181 | mi->paddr); | ||
1182 | if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { | ||
1183 | pr_err("Memory error not recovered"); | ||
1184 | force_sig(SIGBUS, current); | ||
1185 | } | ||
1186 | mce_clear_info(mi); | ||
1120 | } | 1187 | } |
1121 | 1188 | ||
1189 | /* | ||
1190 | * Action optional processing happens here (picking up | ||
1191 | * from the list of faulting pages that do_machine_check() | ||
1192 | * placed into the "ring"). | ||
1193 | */ | ||
1122 | static void mce_process_work(struct work_struct *dummy) | 1194 | static void mce_process_work(struct work_struct *dummy) |
1123 | { | 1195 | { |
1124 | mce_notify_process(); | 1196 | unsigned long pfn; |
1197 | |||
1198 | while (mce_ring_get(&pfn)) | ||
1199 | memory_failure(pfn, MCE_VECTOR, 0); | ||
1125 | } | 1200 | } |
1126 | 1201 | ||
1127 | #ifdef CONFIG_X86_MCE_INTEL | 1202 | #ifdef CONFIG_X86_MCE_INTEL |
@@ -1211,8 +1286,6 @@ int mce_notify_irq(void) | |||
1211 | /* Not more than two messages every minute */ | 1286 | /* Not more than two messages every minute */ |
1212 | static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); | 1287 | static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); |
1213 | 1288 | ||
1214 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
1215 | |||
1216 | if (test_and_clear_bit(0, &mce_need_notify)) { | 1289 | if (test_and_clear_bit(0, &mce_need_notify)) { |
1217 | /* wake processes polling /dev/mcelog */ | 1290 | /* wake processes polling /dev/mcelog */ |
1218 | wake_up_interruptible(&mce_chrdev_wait); | 1291 | wake_up_interruptible(&mce_chrdev_wait); |
@@ -1541,6 +1614,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize) | |||
1541 | /* Error or no more MCE record */ | 1614 | /* Error or no more MCE record */ |
1542 | if (rc <= 0) { | 1615 | if (rc <= 0) { |
1543 | mce_apei_read_done = 1; | 1616 | mce_apei_read_done = 1; |
1617 | /* | ||
1618 | * When ERST is disabled, mce_chrdev_read() should return | ||
1619 | * "no record" instead of "no device." | ||
1620 | */ | ||
1621 | if (rc == -ENODEV) | ||
1622 | return 0; | ||
1544 | return rc; | 1623 | return rc; |
1545 | } | 1624 | } |
1546 | rc = -EFAULT; | 1625 | rc = -EFAULT; |
@@ -1859,7 +1938,7 @@ static struct bus_type mce_subsys = { | |||
1859 | .dev_name = "machinecheck", | 1938 | .dev_name = "machinecheck", |
1860 | }; | 1939 | }; |
1861 | 1940 | ||
1862 | struct device *mce_device[CONFIG_NR_CPUS]; | 1941 | DEFINE_PER_CPU(struct device *, mce_device); |
1863 | 1942 | ||
1864 | __cpuinitdata | 1943 | __cpuinitdata |
1865 | void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); | 1944 | void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); |
@@ -2038,7 +2117,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) | |||
2038 | goto error2; | 2117 | goto error2; |
2039 | } | 2118 | } |
2040 | cpumask_set_cpu(cpu, mce_device_initialized); | 2119 | cpumask_set_cpu(cpu, mce_device_initialized); |
2041 | mce_device[cpu] = dev; | 2120 | per_cpu(mce_device, cpu) = dev; |
2042 | 2121 | ||
2043 | return 0; | 2122 | return 0; |
2044 | error2: | 2123 | error2: |
@@ -2055,7 +2134,7 @@ error: | |||
2055 | 2134 | ||
2056 | static __cpuinit void mce_device_remove(unsigned int cpu) | 2135 | static __cpuinit void mce_device_remove(unsigned int cpu) |
2057 | { | 2136 | { |
2058 | struct device *dev = mce_device[cpu]; | 2137 | struct device *dev = per_cpu(mce_device, cpu); |
2059 | int i; | 2138 | int i; |
2060 | 2139 | ||
2061 | if (!cpumask_test_cpu(cpu, mce_device_initialized)) | 2140 | if (!cpumask_test_cpu(cpu, mce_device_initialized)) |
@@ -2069,7 +2148,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) | |||
2069 | 2148 | ||
2070 | device_unregister(dev); | 2149 | device_unregister(dev); |
2071 | cpumask_clear_cpu(cpu, mce_device_initialized); | 2150 | cpumask_clear_cpu(cpu, mce_device_initialized); |
2072 | mce_device[cpu] = NULL; | 2151 | per_cpu(mce_device, cpu) = NULL; |
2073 | } | 2152 | } |
2074 | 2153 | ||
2075 | /* Make sure there are no machine checks on offlined CPUs. */ | 2154 | /* Make sure there are no machine checks on offlined CPUs. */ |
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 786e76a86322..99b57179f912 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -523,11 +523,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
523 | { | 523 | { |
524 | int i, err = 0; | 524 | int i, err = 0; |
525 | struct threshold_bank *b = NULL; | 525 | struct threshold_bank *b = NULL; |
526 | struct device *dev = mce_device[cpu]; | 526 | struct device *dev = per_cpu(mce_device, cpu); |
527 | char name[32]; | 527 | char name[32]; |
528 | 528 | ||
529 | sprintf(name, "threshold_bank%i", bank); | 529 | sprintf(name, "threshold_bank%i", bank); |
530 | 530 | ||
531 | #ifdef CONFIG_SMP | ||
531 | if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ | 532 | if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ |
532 | i = cpumask_first(cpu_llc_shared_mask(cpu)); | 533 | i = cpumask_first(cpu_llc_shared_mask(cpu)); |
533 | 534 | ||
@@ -553,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
553 | 554 | ||
554 | goto out; | 555 | goto out; |
555 | } | 556 | } |
557 | #endif | ||
556 | 558 | ||
557 | b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); | 559 | b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); |
558 | if (!b) { | 560 | if (!b) { |
@@ -585,7 +587,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
585 | if (i == cpu) | 587 | if (i == cpu) |
586 | continue; | 588 | continue; |
587 | 589 | ||
588 | dev = mce_device[i]; | 590 | dev = per_cpu(mce_device, i); |
589 | if (dev) | 591 | if (dev) |
590 | err = sysfs_create_link(&dev->kobj,b->kobj, name); | 592 | err = sysfs_create_link(&dev->kobj,b->kobj, name); |
591 | if (err) | 593 | if (err) |
@@ -665,7 +667,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank) | |||
665 | #ifdef CONFIG_SMP | 667 | #ifdef CONFIG_SMP |
666 | /* sibling symlink */ | 668 | /* sibling symlink */ |
667 | if (shared_bank[bank] && b->blocks->cpu != cpu) { | 669 | if (shared_bank[bank] && b->blocks->cpu != cpu) { |
668 | sysfs_remove_link(&mce_device[cpu]->kobj, name); | 670 | dev = per_cpu(mce_device, cpu); |
671 | sysfs_remove_link(&dev->kobj, name); | ||
669 | per_cpu(threshold_banks, cpu)[bank] = NULL; | 672 | per_cpu(threshold_banks, cpu)[bank] = NULL; |
670 | 673 | ||
671 | return; | 674 | return; |
@@ -677,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) | |||
677 | if (i == cpu) | 680 | if (i == cpu) |
678 | continue; | 681 | continue; |
679 | 682 | ||
680 | dev = mce_device[i]; | 683 | dev = per_cpu(mce_device, i); |
681 | if (dev) | 684 | if (dev) |
682 | sysfs_remove_link(&dev->kobj, name); | 685 | sysfs_remove_link(&dev->kobj, name); |
683 | per_cpu(threshold_banks, i)[bank] = NULL; | 686 | per_cpu(threshold_banks, i)[bank] = NULL; |
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5adce1040b11..fa2900c0e398 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
26 | #include <linux/bitops.h> | 26 | #include <linux/bitops.h> |
27 | #include <linux/device.h> | ||
27 | 28 | ||
28 | #include <asm/apic.h> | 29 | #include <asm/apic.h> |
29 | #include <asm/stacktrace.h> | 30 | #include <asm/stacktrace.h> |
@@ -31,6 +32,7 @@ | |||
31 | #include <asm/compat.h> | 32 | #include <asm/compat.h> |
32 | #include <asm/smp.h> | 33 | #include <asm/smp.h> |
33 | #include <asm/alternative.h> | 34 | #include <asm/alternative.h> |
35 | #include <asm/timer.h> | ||
34 | 36 | ||
35 | #include "perf_event.h" | 37 | #include "perf_event.h" |
36 | 38 | ||
@@ -351,6 +353,36 @@ int x86_setup_perfctr(struct perf_event *event) | |||
351 | return 0; | 353 | return 0; |
352 | } | 354 | } |
353 | 355 | ||
356 | /* | ||
357 | * check that branch_sample_type is compatible with | ||
358 | * settings needed for precise_ip > 1 which implies | ||
359 | * using the LBR to capture ALL taken branches at the | ||
360 | * priv levels of the measurement | ||
361 | */ | ||
362 | static inline int precise_br_compat(struct perf_event *event) | ||
363 | { | ||
364 | u64 m = event->attr.branch_sample_type; | ||
365 | u64 b = 0; | ||
366 | |||
367 | /* must capture all branches */ | ||
368 | if (!(m & PERF_SAMPLE_BRANCH_ANY)) | ||
369 | return 0; | ||
370 | |||
371 | m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; | ||
372 | |||
373 | if (!event->attr.exclude_user) | ||
374 | b |= PERF_SAMPLE_BRANCH_USER; | ||
375 | |||
376 | if (!event->attr.exclude_kernel) | ||
377 | b |= PERF_SAMPLE_BRANCH_KERNEL; | ||
378 | |||
379 | /* | ||
380 | * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 | ||
381 | */ | ||
382 | |||
383 | return m == b; | ||
384 | } | ||
385 | |||
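/*
 * Illustrative attr settings (not from this patch) and how the check
 * above treats a precise_ip > 1 event:
 *
 *	attr.exclude_kernel	= 1;	// user-only event
 *	attr.branch_sample_type	= PERF_SAMPLE_BRANCH_ANY |
 *				  PERF_SAMPLE_BRANCH_USER;
 *
 * After masking, m == b == PERF_SAMPLE_BRANCH_USER, so the request is
 * compatible.  Asking for PERF_SAMPLE_BRANCH_KERNEL as well on the same
 * event would make m != b, and x86_pmu_hw_config() below rejects it
 * with -EOPNOTSUPP, since the LBR used for the PEBS fixup must run at
 * exactly the priv level of the event.
 */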
354 | int x86_pmu_hw_config(struct perf_event *event) | 386 | int x86_pmu_hw_config(struct perf_event *event) |
355 | { | 387 | { |
356 | if (event->attr.precise_ip) { | 388 | if (event->attr.precise_ip) { |
@@ -367,6 +399,36 @@ int x86_pmu_hw_config(struct perf_event *event) | |||
367 | 399 | ||
368 | if (event->attr.precise_ip > precise) | 400 | if (event->attr.precise_ip > precise) |
369 | return -EOPNOTSUPP; | 401 | return -EOPNOTSUPP; |
402 | /* | ||
403 | * check that PEBS LBR correction does not conflict with | ||
404 | * whatever the user is asking with attr->branch_sample_type | ||
405 | */ | ||
406 | if (event->attr.precise_ip > 1) { | ||
407 | u64 *br_type = &event->attr.branch_sample_type; | ||
408 | |||
409 | if (has_branch_stack(event)) { | ||
410 | if (!precise_br_compat(event)) | ||
411 | return -EOPNOTSUPP; | ||
412 | |||
413 | /* branch_sample_type is compatible */ | ||
414 | |||
415 | } else { | ||
416 | /* | ||
417 | * user did not specify branch_sample_type | ||
418 | * | ||
419 | * For PEBS fixups, we capture all | ||
420 | * the branches at the priv level of the | ||
421 | * event. | ||
422 | */ | ||
423 | *br_type = PERF_SAMPLE_BRANCH_ANY; | ||
424 | |||
425 | if (!event->attr.exclude_user) | ||
426 | *br_type |= PERF_SAMPLE_BRANCH_USER; | ||
427 | |||
428 | if (!event->attr.exclude_kernel) | ||
429 | *br_type |= PERF_SAMPLE_BRANCH_KERNEL; | ||
430 | } | ||
431 | } | ||
370 | } | 432 | } |
371 | 433 | ||
372 | /* | 434 | /* |
@@ -424,6 +486,10 @@ static int __x86_pmu_event_init(struct perf_event *event) | |||
424 | /* mark unused */ | 486 | /* mark unused */ |
425 | event->hw.extra_reg.idx = EXTRA_REG_NONE; | 487 | event->hw.extra_reg.idx = EXTRA_REG_NONE; |
426 | 488 | ||
489 | /* mark not used */ | ||
490 | event->hw.extra_reg.idx = EXTRA_REG_NONE; | ||
491 | event->hw.branch_reg.idx = EXTRA_REG_NONE; | ||
492 | |||
427 | return x86_pmu.hw_config(event); | 493 | return x86_pmu.hw_config(event); |
428 | } | 494 | } |
429 | 495 | ||
@@ -577,14 +643,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) | |||
577 | /* Prefer fixed purpose counters */ | 643 | /* Prefer fixed purpose counters */ |
578 | if (x86_pmu.num_counters_fixed) { | 644 | if (x86_pmu.num_counters_fixed) { |
579 | idx = X86_PMC_IDX_FIXED; | 645 | idx = X86_PMC_IDX_FIXED; |
580 | for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { | 646 | for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { |
581 | if (!__test_and_set_bit(idx, sched->state.used)) | 647 | if (!__test_and_set_bit(idx, sched->state.used)) |
582 | goto done; | 648 | goto done; |
583 | } | 649 | } |
584 | } | 650 | } |
585 | /* Grab the first unused counter starting with idx */ | 651 | /* Grab the first unused counter starting with idx */ |
586 | idx = sched->state.counter; | 652 | idx = sched->state.counter; |
587 | for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { | 653 | for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { |
588 | if (!__test_and_set_bit(idx, sched->state.used)) | 654 | if (!__test_and_set_bit(idx, sched->state.used)) |
589 | goto done; | 655 | goto done; |
590 | } | 656 | } |
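The only change in this hunk is the bitmap-iterator rename: for_each_set_bit_cont() became for_each_set_bit_from(), which starts scanning at the current value of the iteration variable instead of at bit 0. A tiny illustration (values made up):

	unsigned long mask = 0x2c;	/* bits 2, 3 and 5 set */
	int idx = 3;

	for_each_set_bit_from(idx, &mask, BITS_PER_LONG)
		pr_info("%d\n", idx);	/* prints 3, then 5; bit 2 is skipped */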
@@ -1210,6 +1276,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) | |||
1210 | break; | 1276 | break; |
1211 | 1277 | ||
1212 | case CPU_STARTING: | 1278 | case CPU_STARTING: |
1279 | if (x86_pmu.attr_rdpmc) | ||
1280 | set_in_cr4(X86_CR4_PCE); | ||
1213 | if (x86_pmu.cpu_starting) | 1281 | if (x86_pmu.cpu_starting) |
1214 | x86_pmu.cpu_starting(cpu); | 1282 | x86_pmu.cpu_starting(cpu); |
1215 | break; | 1283 | break; |
@@ -1319,6 +1387,8 @@ static int __init init_hw_perf_events(void) | |||
1319 | } | 1387 | } |
1320 | } | 1388 | } |
1321 | 1389 | ||
1390 | x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ | ||
1391 | |||
1322 | pr_info("... version: %d\n", x86_pmu.version); | 1392 | pr_info("... version: %d\n", x86_pmu.version); |
1323 | pr_info("... bit width: %d\n", x86_pmu.cntval_bits); | 1393 | pr_info("... bit width: %d\n", x86_pmu.cntval_bits); |
1324 | pr_info("... generic registers: %d\n", x86_pmu.num_counters); | 1394 | pr_info("... generic registers: %d\n", x86_pmu.num_counters); |
@@ -1542,23 +1612,106 @@ static int x86_pmu_event_init(struct perf_event *event) | |||
1542 | return err; | 1612 | return err; |
1543 | } | 1613 | } |
1544 | 1614 | ||
1615 | static int x86_pmu_event_idx(struct perf_event *event) | ||
1616 | { | ||
1617 | int idx = event->hw.idx; | ||
1618 | |||
1619 | if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { | ||
1620 | idx -= X86_PMC_IDX_FIXED; | ||
1621 | idx |= 1 << 30; | ||
1622 | } | ||
1623 | |||
1624 | return idx + 1; | ||
1625 | } | ||
1626 | |||
1627 | static ssize_t get_attr_rdpmc(struct device *cdev, | ||
1628 | struct device_attribute *attr, | ||
1629 | char *buf) | ||
1630 | { | ||
1631 | return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); | ||
1632 | } | ||
1633 | |||
1634 | static void change_rdpmc(void *info) | ||
1635 | { | ||
1636 | bool enable = !!(unsigned long)info; | ||
1637 | |||
1638 | if (enable) | ||
1639 | set_in_cr4(X86_CR4_PCE); | ||
1640 | else | ||
1641 | clear_in_cr4(X86_CR4_PCE); | ||
1642 | } | ||
1643 | |||
1644 | static ssize_t set_attr_rdpmc(struct device *cdev, | ||
1645 | struct device_attribute *attr, | ||
1646 | const char *buf, size_t count) | ||
1647 | { | ||
1648 | unsigned long val = simple_strtoul(buf, NULL, 0); | ||
1649 | |||
1650 | if (!!val != !!x86_pmu.attr_rdpmc) { | ||
1651 | x86_pmu.attr_rdpmc = !!val; | ||
1652 | smp_call_function(change_rdpmc, (void *)val, 1); | ||
1653 | } | ||
1654 | |||
1655 | return count; | ||
1656 | } | ||
1657 | |||
1658 | static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); | ||
1659 | |||
1660 | static struct attribute *x86_pmu_attrs[] = { | ||
1661 | &dev_attr_rdpmc.attr, | ||
1662 | NULL, | ||
1663 | }; | ||
1664 | |||
1665 | static struct attribute_group x86_pmu_attr_group = { | ||
1666 | .attrs = x86_pmu_attrs, | ||
1667 | }; | ||
1668 | |||
1669 | static const struct attribute_group *x86_pmu_attr_groups[] = { | ||
1670 | &x86_pmu_attr_group, | ||
1671 | NULL, | ||
1672 | }; | ||
1673 | |||
1674 | static void x86_pmu_flush_branch_stack(void) | ||
1675 | { | ||
1676 | if (x86_pmu.flush_branch_stack) | ||
1677 | x86_pmu.flush_branch_stack(); | ||
1678 | } | ||
1679 | |||
1545 | static struct pmu pmu = { | 1680 | static struct pmu pmu = { |
1546 | .pmu_enable = x86_pmu_enable, | 1681 | .pmu_enable = x86_pmu_enable, |
1547 | .pmu_disable = x86_pmu_disable, | 1682 | .pmu_disable = x86_pmu_disable, |
1683 | |||
1684 | .attr_groups = x86_pmu_attr_groups, | ||
1548 | 1685 | ||
1549 | .event_init = x86_pmu_event_init, | 1686 | .event_init = x86_pmu_event_init, |
1550 | 1687 | ||
1551 | .add = x86_pmu_add, | 1688 | .add = x86_pmu_add, |
1552 | .del = x86_pmu_del, | 1689 | .del = x86_pmu_del, |
1553 | .start = x86_pmu_start, | 1690 | .start = x86_pmu_start, |
1554 | .stop = x86_pmu_stop, | 1691 | .stop = x86_pmu_stop, |
1555 | .read = x86_pmu_read, | 1692 | .read = x86_pmu_read, |
1556 | 1693 | ||
1557 | .start_txn = x86_pmu_start_txn, | 1694 | .start_txn = x86_pmu_start_txn, |
1558 | .cancel_txn = x86_pmu_cancel_txn, | 1695 | .cancel_txn = x86_pmu_cancel_txn, |
1559 | .commit_txn = x86_pmu_commit_txn, | 1696 | .commit_txn = x86_pmu_commit_txn, |
1697 | |||
1698 | .event_idx = x86_pmu_event_idx, | ||
1699 | .flush_branch_stack = x86_pmu_flush_branch_stack, | ||
1560 | }; | 1700 | }; |
1561 | 1701 | ||
1702 | void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) | ||
1703 | { | ||
1704 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) | ||
1705 | return; | ||
1706 | |||
1707 | if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) | ||
1708 | return; | ||
1709 | |||
1710 | userpg->time_mult = this_cpu_read(cyc2ns); | ||
1711 | userpg->time_shift = CYC2NS_SCALE_FACTOR; | ||
1712 | userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; | ||
1713 | } | ||
1714 | |||
1562 | /* | 1715 | /* |
1563 | * callchain support | 1716 | * callchain support |
1564 | */ | 1717 | */ |
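For context on the user-page plumbing added above (x86_pmu_event_idx(), perf_update_user_clock() and the rdpmc sysfs attribute), here is a hypothetical userspace sketch of a self-monitoring read via RDPMC. It assumes the lock/index/offset fields of struct perf_event_mmap_page as declared in linux/perf_event.h; the rdpmc() and read_self_count() helper names are illustrative, not part of this patch:

	#include <stdint.h>
	#include <linux/perf_event.h>

	static uint64_t rdpmc(uint32_t counter)
	{
		uint32_t lo, hi;

		/* RDPMC takes the counter index in ECX and returns EDX:EAX */
		asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
		return ((uint64_t)hi << 32) | lo;
	}

	/* pc points at the mmap'ed first page of a self-monitoring event */
	static uint64_t read_self_count(volatile struct perf_event_mmap_page *pc)
	{
		uint32_t seq, idx;
		uint64_t count;

		do {
			seq = pc->lock;
			asm volatile("" ::: "memory");
			idx = pc->index;	/* 0 means RDPMC is not usable */
			count = pc->offset;
			if (idx)
				count += rdpmc(idx - 1);	/* event_idx is counter + 1 */
			asm volatile("" ::: "memory");
		} while (pc->lock != seq);

		return count;
	}

The x86_pmu.attr_rdpmc knob gates CR4.PCE, so an administrator can presumably disable this path system-wide by writing 0 to the rdpmc attribute of the cpu PMU device in sysfs (typically /sys/bus/event_source/devices/cpu/rdpmc), which triggers the change_rdpmc() IPI shown above.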
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8944062f46e2..8484e77c211e 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h | |||
@@ -33,6 +33,7 @@ enum extra_reg_type { | |||
33 | 33 | ||
34 | EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ | 34 | EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ |
35 | EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ | 35 | EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ |
36 | EXTRA_REG_LBR = 2, /* lbr_select */ | ||
36 | 37 | ||
37 | EXTRA_REG_MAX /* number of entries needed */ | 38 | EXTRA_REG_MAX /* number of entries needed */ |
38 | }; | 39 | }; |
@@ -130,6 +131,8 @@ struct cpu_hw_events { | |||
130 | void *lbr_context; | 131 | void *lbr_context; |
131 | struct perf_branch_stack lbr_stack; | 132 | struct perf_branch_stack lbr_stack; |
132 | struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; | 133 | struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; |
134 | struct er_account *lbr_sel; | ||
135 | u64 br_sel; | ||
133 | 136 | ||
134 | /* | 137 | /* |
135 | * Intel host/guest exclude bits | 138 | * Intel host/guest exclude bits |
@@ -147,7 +150,9 @@ struct cpu_hw_events { | |||
147 | /* | 150 | /* |
148 | * AMD specific bits | 151 | * AMD specific bits |
149 | */ | 152 | */ |
150 | struct amd_nb *amd_nb; | 153 | struct amd_nb *amd_nb; |
154 | /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ | ||
155 | u64 perf_ctr_virt_mask; | ||
151 | 156 | ||
152 | void *kfree_on_online; | 157 | void *kfree_on_online; |
153 | }; | 158 | }; |
@@ -266,6 +271,29 @@ struct x86_pmu_quirk { | |||
266 | void (*func)(void); | 271 | void (*func)(void); |
267 | }; | 272 | }; |
268 | 273 | ||
274 | union x86_pmu_config { | ||
275 | struct { | ||
276 | u64 event:8, | ||
277 | umask:8, | ||
278 | usr:1, | ||
279 | os:1, | ||
280 | edge:1, | ||
281 | pc:1, | ||
282 | interrupt:1, | ||
283 | __reserved1:1, | ||
284 | en:1, | ||
285 | inv:1, | ||
286 | cmask:8, | ||
287 | event2:4, | ||
288 | __reserved2:4, | ||
289 | go:1, | ||
290 | ho:1; | ||
291 | } bits; | ||
292 | u64 value; | ||
293 | }; | ||
294 | |||
295 | #define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value | ||
296 | |||
269 | /* | 297 | /* |
270 | * struct x86_pmu - generic x86 pmu | 298 | * struct x86_pmu - generic x86 pmu |
271 | */ | 299 | */ |
@@ -307,10 +335,19 @@ struct x86_pmu { | |||
307 | struct x86_pmu_quirk *quirks; | 335 | struct x86_pmu_quirk *quirks; |
308 | int perfctr_second_write; | 336 | int perfctr_second_write; |
309 | 337 | ||
338 | /* | ||
339 | * sysfs attrs | ||
340 | */ | ||
341 | int attr_rdpmc; | ||
342 | |||
343 | /* | ||
344 | * CPU Hotplug hooks | ||
345 | */ | ||
310 | int (*cpu_prepare)(int cpu); | 346 | int (*cpu_prepare)(int cpu); |
311 | void (*cpu_starting)(int cpu); | 347 | void (*cpu_starting)(int cpu); |
312 | void (*cpu_dying)(int cpu); | 348 | void (*cpu_dying)(int cpu); |
313 | void (*cpu_dead)(int cpu); | 349 | void (*cpu_dead)(int cpu); |
350 | void (*flush_branch_stack)(void); | ||
314 | 351 | ||
315 | /* | 352 | /* |
316 | * Intel Arch Perfmon v2+ | 353 | * Intel Arch Perfmon v2+ |
@@ -332,6 +369,8 @@ struct x86_pmu { | |||
332 | */ | 369 | */ |
333 | unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ | 370 | unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ |
334 | int lbr_nr; /* hardware stack size */ | 371 | int lbr_nr; /* hardware stack size */ |
372 | u64 lbr_sel_mask; /* LBR_SELECT valid bits */ | ||
373 | const int *lbr_sel_map; /* lbr_select mappings */ | ||
335 | 374 | ||
336 | /* | 375 | /* |
337 | * Extra registers for events | 376 | * Extra registers for events |
@@ -417,9 +456,11 @@ void x86_pmu_disable_all(void); | |||
417 | static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, | 456 | static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, |
418 | u64 enable_mask) | 457 | u64 enable_mask) |
419 | { | 458 | { |
459 | u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); | ||
460 | |||
420 | if (hwc->extra_reg.reg) | 461 | if (hwc->extra_reg.reg) |
421 | wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); | 462 | wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); |
422 | wrmsrl(hwc->config_base, hwc->config | enable_mask); | 463 | wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); |
423 | } | 464 | } |
424 | 465 | ||
425 | void x86_pmu_enable_all(int added); | 466 | void x86_pmu_enable_all(int added); |
@@ -443,6 +484,15 @@ extern struct event_constraint emptyconstraint; | |||
443 | 484 | ||
444 | extern struct event_constraint unconstrained; | 485 | extern struct event_constraint unconstrained; |
445 | 486 | ||
487 | static inline bool kernel_ip(unsigned long ip) | ||
488 | { | ||
489 | #ifdef CONFIG_X86_32 | ||
490 | return ip > PAGE_OFFSET; | ||
491 | #else | ||
492 | return (long)ip < 0; | ||
493 | #endif | ||
494 | } | ||
495 | |||
446 | #ifdef CONFIG_CPU_SUP_AMD | 496 | #ifdef CONFIG_CPU_SUP_AMD |
447 | 497 | ||
448 | int amd_pmu_init(void); | 498 | int amd_pmu_init(void); |
@@ -523,6 +573,10 @@ void intel_pmu_lbr_init_nhm(void); | |||
523 | 573 | ||
524 | void intel_pmu_lbr_init_atom(void); | 574 | void intel_pmu_lbr_init_atom(void); |
525 | 575 | ||
576 | void intel_pmu_lbr_init_snb(void); | ||
577 | |||
578 | int intel_pmu_setup_lbr_filter(struct perf_event *event); | ||
579 | |||
526 | int p4_pmu_init(void); | 580 | int p4_pmu_init(void); |
527 | 581 | ||
528 | int p6_pmu_init(void); | 582 | int p6_pmu_init(void); |
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 0397b23be8e9..dd002faff7a6 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c | |||
@@ -1,4 +1,5 @@ | |||
1 | #include <linux/perf_event.h> | 1 | #include <linux/perf_event.h> |
2 | #include <linux/export.h> | ||
2 | #include <linux/types.h> | 3 | #include <linux/types.h> |
3 | #include <linux/init.h> | 4 | #include <linux/init.h> |
4 | #include <linux/slab.h> | 5 | #include <linux/slab.h> |
@@ -138,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event) | |||
138 | if (ret) | 139 | if (ret) |
139 | return ret; | 140 | return ret; |
140 | 141 | ||
142 | if (has_branch_stack(event)) | ||
143 | return -EOPNOTSUPP; | ||
144 | |||
141 | if (event->attr.exclude_host && event->attr.exclude_guest) | 145 | if (event->attr.exclude_host && event->attr.exclude_guest) |
142 | /* | 146 | /* |
143 | * When HO == GO == 1 the hardware treats that as GO == HO == 0 | 147 | * When HO == GO == 1 the hardware treats that as GO == HO == 0 |
@@ -357,7 +361,9 @@ static void amd_pmu_cpu_starting(int cpu) | |||
357 | struct amd_nb *nb; | 361 | struct amd_nb *nb; |
358 | int i, nb_id; | 362 | int i, nb_id; |
359 | 363 | ||
360 | if (boot_cpu_data.x86_max_cores < 2) | 364 | cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; |
365 | |||
366 | if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) | ||
361 | return; | 367 | return; |
362 | 368 | ||
363 | nb_id = amd_get_nb_id(cpu); | 369 | nb_id = amd_get_nb_id(cpu); |
@@ -587,9 +593,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { | |||
587 | .put_event_constraints = amd_put_event_constraints, | 593 | .put_event_constraints = amd_put_event_constraints, |
588 | 594 | ||
589 | .cpu_prepare = amd_pmu_cpu_prepare, | 595 | .cpu_prepare = amd_pmu_cpu_prepare, |
590 | .cpu_starting = amd_pmu_cpu_starting, | ||
591 | .cpu_dead = amd_pmu_cpu_dead, | 596 | .cpu_dead = amd_pmu_cpu_dead, |
592 | #endif | 597 | #endif |
598 | .cpu_starting = amd_pmu_cpu_starting, | ||
593 | }; | 599 | }; |
594 | 600 | ||
595 | __init int amd_pmu_init(void) | 601 | __init int amd_pmu_init(void) |
@@ -621,3 +627,33 @@ __init int amd_pmu_init(void) | |||
621 | 627 | ||
622 | return 0; | 628 | return 0; |
623 | } | 629 | } |
630 | |||
631 | void amd_pmu_enable_virt(void) | ||
632 | { | ||
633 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
634 | |||
635 | cpuc->perf_ctr_virt_mask = 0; | ||
636 | |||
637 | /* Reload all events */ | ||
638 | x86_pmu_disable_all(); | ||
639 | x86_pmu_enable_all(0); | ||
640 | } | ||
641 | EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); | ||
642 | |||
643 | void amd_pmu_disable_virt(void) | ||
644 | { | ||
645 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
646 | |||
647 | /* | ||
648 | * We only mask out the Host-only bit so that host-only counting works | ||
649 | * when SVM is disabled. If someone sets up a guest-only counter when | ||
650 | * SVM is disabled, the Guest-only bit still gets set and the counter | ||
651 | * will not count anything. | ||
652 | */ | ||
653 | cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; | ||
654 | |||
655 | /* Reload all events */ | ||
656 | x86_pmu_disable_all(); | ||
657 | x86_pmu_enable_all(0); | ||
658 | } | ||
659 | EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); | ||
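The exported amd_pmu_enable_virt()/amd_pmu_disable_virt() pair is meant to be driven from the SVM host side; the Host-Only bit is masked out only while SVM is disabled, per the comment above. A hypothetical caller sketch (the example_ function names and the exact placement are assumptions, not part of this diff):

	/* host driver bring-up path */
	static int example_svm_hardware_enable(void)
	{
		/* ... set EFER.SVME, program the host save area ... */
		amd_pmu_enable_virt();		/* stop masking the HO/GO bits */
		return 0;
	}

	/* host driver tear-down path */
	static void example_svm_hardware_disable(void)
	{
		amd_pmu_disable_virt();		/* mask Host-Only counting again */
		/* ... clear EFER.SVME ... */
	}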
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3bd37bdf1b8e..6a84e7f28f05 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c | |||
@@ -385,14 +385,15 @@ static __initconst const u64 westmere_hw_cache_event_ids | |||
385 | #define NHM_LOCAL_DRAM (1 << 14) | 385 | #define NHM_LOCAL_DRAM (1 << 14) |
386 | #define NHM_NON_DRAM (1 << 15) | 386 | #define NHM_NON_DRAM (1 << 15) |
387 | 387 | ||
388 | #define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM) | 388 | #define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) |
389 | #define NHM_REMOTE (NHM_REMOTE_DRAM) | ||
389 | 390 | ||
390 | #define NHM_DMND_READ (NHM_DMND_DATA_RD) | 391 | #define NHM_DMND_READ (NHM_DMND_DATA_RD) |
391 | #define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) | 392 | #define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) |
392 | #define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) | 393 | #define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) |
393 | 394 | ||
394 | #define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) | 395 | #define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) |
395 | #define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD) | 396 | #define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD) |
396 | #define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) | 397 | #define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) |
397 | 398 | ||
398 | static __initconst const u64 nehalem_hw_cache_extra_regs | 399 | static __initconst const u64 nehalem_hw_cache_extra_regs |
@@ -416,16 +417,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs | |||
416 | }, | 417 | }, |
417 | [ C(NODE) ] = { | 418 | [ C(NODE) ] = { |
418 | [ C(OP_READ) ] = { | 419 | [ C(OP_READ) ] = { |
419 | [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM, | 420 | [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, |
420 | [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM, | 421 | [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE, |
421 | }, | 422 | }, |
422 | [ C(OP_WRITE) ] = { | 423 | [ C(OP_WRITE) ] = { |
423 | [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM, | 424 | [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, |
424 | [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM, | 425 | [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE, |
425 | }, | 426 | }, |
426 | [ C(OP_PREFETCH) ] = { | 427 | [ C(OP_PREFETCH) ] = { |
427 | [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM, | 428 | [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, |
428 | [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM, | 429 | [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE, |
429 | }, | 430 | }, |
430 | }, | 431 | }, |
431 | }; | 432 | }; |
@@ -727,6 +728,19 @@ static __initconst const u64 atom_hw_cache_event_ids | |||
727 | }, | 728 | }, |
728 | }; | 729 | }; |
729 | 730 | ||
731 | static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) | ||
732 | { | ||
733 | /* user explicitly requested branch sampling */ | ||
734 | if (has_branch_stack(event)) | ||
735 | return true; | ||
736 | |||
737 | /* implicit branch sampling to correct PEBS skid */ | ||
738 | if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) | ||
739 | return true; | ||
740 | |||
741 | return false; | ||
742 | } | ||
743 | |||
730 | static void intel_pmu_disable_all(void) | 744 | static void intel_pmu_disable_all(void) |
731 | { | 745 | { |
732 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 746 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
@@ -881,6 +895,13 @@ static void intel_pmu_disable_event(struct perf_event *event) | |||
881 | cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); | 895 | cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); |
882 | cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); | 896 | cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); |
883 | 897 | ||
898 | /* | ||
899 | * must be disabled before any actual event | ||
900 | * because any event may be combined with LBR | ||
901 | */ | ||
902 | if (intel_pmu_needs_lbr_smpl(event)) | ||
903 | intel_pmu_lbr_disable(event); | ||
904 | |||
884 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { | 905 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { |
885 | intel_pmu_disable_fixed(hwc); | 906 | intel_pmu_disable_fixed(hwc); |
886 | return; | 907 | return; |
@@ -935,6 +956,12 @@ static void intel_pmu_enable_event(struct perf_event *event) | |||
935 | intel_pmu_enable_bts(hwc->config); | 956 | intel_pmu_enable_bts(hwc->config); |
936 | return; | 957 | return; |
937 | } | 958 | } |
959 | /* | ||
960 | * must be enabled before any actual event | ||
961 | * because any event may be combined with LBR | ||
962 | */ | ||
963 | if (intel_pmu_needs_lbr_smpl(event)) | ||
964 | intel_pmu_lbr_enable(event); | ||
938 | 965 | ||
939 | if (event->attr.exclude_host) | 966 | if (event->attr.exclude_host) |
940 | cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); | 967 | cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); |
@@ -1057,6 +1084,9 @@ again: | |||
1057 | 1084 | ||
1058 | data.period = event->hw.last_period; | 1085 | data.period = event->hw.last_period; |
1059 | 1086 | ||
1087 | if (has_branch_stack(event)) | ||
1088 | data.br_stack = &cpuc->lbr_stack; | ||
1089 | |||
1060 | if (perf_event_overflow(event, &data, regs)) | 1090 | if (perf_event_overflow(event, &data, regs)) |
1061 | x86_pmu_stop(event, 0); | 1091 | x86_pmu_stop(event, 0); |
1062 | } | 1092 | } |
@@ -1123,17 +1153,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx) | |||
1123 | */ | 1153 | */ |
1124 | static struct event_constraint * | 1154 | static struct event_constraint * |
1125 | __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, | 1155 | __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, |
1126 | struct perf_event *event) | 1156 | struct perf_event *event, |
1157 | struct hw_perf_event_extra *reg) | ||
1127 | { | 1158 | { |
1128 | struct event_constraint *c = &emptyconstraint; | 1159 | struct event_constraint *c = &emptyconstraint; |
1129 | struct hw_perf_event_extra *reg = &event->hw.extra_reg; | ||
1130 | struct er_account *era; | 1160 | struct er_account *era; |
1131 | unsigned long flags; | 1161 | unsigned long flags; |
1132 | int orig_idx = reg->idx; | 1162 | int orig_idx = reg->idx; |
1133 | 1163 | ||
1134 | /* already allocated shared msr */ | 1164 | /* already allocated shared msr */ |
1135 | if (reg->alloc) | 1165 | if (reg->alloc) |
1136 | return &unconstrained; | 1166 | return NULL; /* call x86_get_event_constraints() */ |
1137 | 1167 | ||
1138 | again: | 1168 | again: |
1139 | era = &cpuc->shared_regs->regs[reg->idx]; | 1169 | era = &cpuc->shared_regs->regs[reg->idx]; |
@@ -1156,14 +1186,10 @@ again: | |||
1156 | reg->alloc = 1; | 1186 | reg->alloc = 1; |
1157 | 1187 | ||
1158 | /* | 1188 | /* |
1159 | * All events using extra_reg are unconstrained. | 1189 | * need to call x86_get_event_constraints() |
1160 | * Avoids calling x86_get_event_constraints() | 1190 | * to check if the associated event has constraints |
1161 | * | ||
1162 | * Must revisit if extra_reg controlling events | ||
1163 | * ever have constraints. Worst case we go through | ||
1164 | * the regular event constraint table. | ||
1165 | */ | 1191 | */ |
1166 | c = &unconstrained; | 1192 | c = NULL; |
1167 | } else if (intel_try_alt_er(event, orig_idx)) { | 1193 | } else if (intel_try_alt_er(event, orig_idx)) { |
1168 | raw_spin_unlock_irqrestore(&era->lock, flags); | 1194 | raw_spin_unlock_irqrestore(&era->lock, flags); |
1169 | goto again; | 1195 | goto again; |
@@ -1200,11 +1226,23 @@ static struct event_constraint * | |||
1200 | intel_shared_regs_constraints(struct cpu_hw_events *cpuc, | 1226 | intel_shared_regs_constraints(struct cpu_hw_events *cpuc, |
1201 | struct perf_event *event) | 1227 | struct perf_event *event) |
1202 | { | 1228 | { |
1203 | struct event_constraint *c = NULL; | 1229 | struct event_constraint *c = NULL, *d; |
1204 | 1230 | struct hw_perf_event_extra *xreg, *breg; | |
1205 | if (event->hw.extra_reg.idx != EXTRA_REG_NONE) | 1231 | |
1206 | c = __intel_shared_reg_get_constraints(cpuc, event); | 1232 | xreg = &event->hw.extra_reg; |
1207 | 1233 | if (xreg->idx != EXTRA_REG_NONE) { | |
1234 | c = __intel_shared_reg_get_constraints(cpuc, event, xreg); | ||
1235 | if (c == &emptyconstraint) | ||
1236 | return c; | ||
1237 | } | ||
1238 | breg = &event->hw.branch_reg; | ||
1239 | if (breg->idx != EXTRA_REG_NONE) { | ||
1240 | d = __intel_shared_reg_get_constraints(cpuc, event, breg); | ||
1241 | if (d == &emptyconstraint) { | ||
1242 | __intel_shared_reg_put_constraints(cpuc, xreg); | ||
1243 | c = d; | ||
1244 | } | ||
1245 | } | ||
1208 | return c; | 1246 | return c; |
1209 | } | 1247 | } |
1210 | 1248 | ||
@@ -1252,6 +1290,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, | |||
1252 | reg = &event->hw.extra_reg; | 1290 | reg = &event->hw.extra_reg; |
1253 | if (reg->idx != EXTRA_REG_NONE) | 1291 | if (reg->idx != EXTRA_REG_NONE) |
1254 | __intel_shared_reg_put_constraints(cpuc, reg); | 1292 | __intel_shared_reg_put_constraints(cpuc, reg); |
1293 | |||
1294 | reg = &event->hw.branch_reg; | ||
1295 | if (reg->idx != EXTRA_REG_NONE) | ||
1296 | __intel_shared_reg_put_constraints(cpuc, reg); | ||
1255 | } | 1297 | } |
1256 | 1298 | ||
1257 | static void intel_put_event_constraints(struct cpu_hw_events *cpuc, | 1299 | static void intel_put_event_constraints(struct cpu_hw_events *cpuc, |
@@ -1287,12 +1329,19 @@ static int intel_pmu_hw_config(struct perf_event *event) | |||
1287 | * | 1329 | * |
1288 | * Thereby we gain a PEBS capable cycle counter. | 1330 | * Thereby we gain a PEBS capable cycle counter. |
1289 | */ | 1331 | */ |
1290 | u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */ | 1332 | u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); |
1333 | |||
1291 | 1334 | ||
1292 | alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); | 1335 | alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); |
1293 | event->hw.config = alt_config; | 1336 | event->hw.config = alt_config; |
1294 | } | 1337 | } |
1295 | 1338 | ||
1339 | if (intel_pmu_needs_lbr_smpl(event)) { | ||
1340 | ret = intel_pmu_setup_lbr_filter(event); | ||
1341 | if (ret) | ||
1342 | return ret; | ||
1343 | } | ||
1344 | |||
1296 | if (event->attr.type != PERF_TYPE_RAW) | 1345 | if (event->attr.type != PERF_TYPE_RAW) |
1297 | return 0; | 1346 | return 0; |
1298 | 1347 | ||
@@ -1431,7 +1480,7 @@ static int intel_pmu_cpu_prepare(int cpu) | |||
1431 | { | 1480 | { |
1432 | struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); | 1481 | struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); |
1433 | 1482 | ||
1434 | if (!x86_pmu.extra_regs) | 1483 | if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) |
1435 | return NOTIFY_OK; | 1484 | return NOTIFY_OK; |
1436 | 1485 | ||
1437 | cpuc->shared_regs = allocate_shared_regs(cpu); | 1486 | cpuc->shared_regs = allocate_shared_regs(cpu); |
@@ -1453,22 +1502,28 @@ static void intel_pmu_cpu_starting(int cpu) | |||
1453 | */ | 1502 | */ |
1454 | intel_pmu_lbr_reset(); | 1503 | intel_pmu_lbr_reset(); |
1455 | 1504 | ||
1456 | if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING)) | 1505 | cpuc->lbr_sel = NULL; |
1506 | |||
1507 | if (!cpuc->shared_regs) | ||
1457 | return; | 1508 | return; |
1458 | 1509 | ||
1459 | for_each_cpu(i, topology_thread_cpumask(cpu)) { | 1510 | if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { |
1460 | struct intel_shared_regs *pc; | 1511 | for_each_cpu(i, topology_thread_cpumask(cpu)) { |
1512 | struct intel_shared_regs *pc; | ||
1461 | 1513 | ||
1462 | pc = per_cpu(cpu_hw_events, i).shared_regs; | 1514 | pc = per_cpu(cpu_hw_events, i).shared_regs; |
1463 | if (pc && pc->core_id == core_id) { | 1515 | if (pc && pc->core_id == core_id) { |
1464 | cpuc->kfree_on_online = cpuc->shared_regs; | 1516 | cpuc->kfree_on_online = cpuc->shared_regs; |
1465 | cpuc->shared_regs = pc; | 1517 | cpuc->shared_regs = pc; |
1466 | break; | 1518 | break; |
1519 | } | ||
1467 | } | 1520 | } |
1521 | cpuc->shared_regs->core_id = core_id; | ||
1522 | cpuc->shared_regs->refcnt++; | ||
1468 | } | 1523 | } |
1469 | 1524 | ||
1470 | cpuc->shared_regs->core_id = core_id; | 1525 | if (x86_pmu.lbr_sel_map) |
1471 | cpuc->shared_regs->refcnt++; | 1526 | cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; |
1472 | } | 1527 | } |
1473 | 1528 | ||
1474 | static void intel_pmu_cpu_dying(int cpu) | 1529 | static void intel_pmu_cpu_dying(int cpu) |
@@ -1486,6 +1541,18 @@ static void intel_pmu_cpu_dying(int cpu) | |||
1486 | fini_debug_store_on_cpu(cpu); | 1541 | fini_debug_store_on_cpu(cpu); |
1487 | } | 1542 | } |
1488 | 1543 | ||
1544 | static void intel_pmu_flush_branch_stack(void) | ||
1545 | { | ||
1546 | /* | ||
1547 | * Intel LBR does not tag entries with the | ||
1548 | * PID of the current task, so we need to | ||
1549 | * flush it on a context switch. | ||
1550 | * For now, we simply reset it. | ||
1551 | */ | ||
1552 | if (x86_pmu.lbr_nr) | ||
1553 | intel_pmu_lbr_reset(); | ||
1554 | } | ||
1555 | |||
1489 | static __initconst const struct x86_pmu intel_pmu = { | 1556 | static __initconst const struct x86_pmu intel_pmu = { |
1490 | .name = "Intel", | 1557 | .name = "Intel", |
1491 | .handle_irq = intel_pmu_handle_irq, | 1558 | .handle_irq = intel_pmu_handle_irq, |
@@ -1513,6 +1580,7 @@ static __initconst const struct x86_pmu intel_pmu = { | |||
1513 | .cpu_starting = intel_pmu_cpu_starting, | 1580 | .cpu_starting = intel_pmu_cpu_starting, |
1514 | .cpu_dying = intel_pmu_cpu_dying, | 1581 | .cpu_dying = intel_pmu_cpu_dying, |
1515 | .guest_get_msrs = intel_guest_get_msrs, | 1582 | .guest_get_msrs = intel_guest_get_msrs, |
1583 | .flush_branch_stack = intel_pmu_flush_branch_stack, | ||
1516 | }; | 1584 | }; |
1517 | 1585 | ||
1518 | static __init void intel_clovertown_quirk(void) | 1586 | static __init void intel_clovertown_quirk(void) |
@@ -1689,9 +1757,11 @@ __init int intel_pmu_init(void) | |||
1689 | x86_pmu.extra_regs = intel_nehalem_extra_regs; | 1757 | x86_pmu.extra_regs = intel_nehalem_extra_regs; |
1690 | 1758 | ||
1691 | /* UOPS_ISSUED.STALLED_CYCLES */ | 1759 | /* UOPS_ISSUED.STALLED_CYCLES */ |
1692 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; | 1760 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = |
1761 | X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); | ||
1693 | /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ | 1762 | /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ |
1694 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; | 1763 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = |
1764 | X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); | ||
1695 | 1765 | ||
1696 | x86_add_quirk(intel_nehalem_quirk); | 1766 | x86_add_quirk(intel_nehalem_quirk); |
1697 | 1767 | ||
@@ -1726,9 +1796,11 @@ __init int intel_pmu_init(void) | |||
1726 | x86_pmu.er_flags |= ERF_HAS_RSP_1; | 1796 | x86_pmu.er_flags |= ERF_HAS_RSP_1; |
1727 | 1797 | ||
1728 | /* UOPS_ISSUED.STALLED_CYCLES */ | 1798 | /* UOPS_ISSUED.STALLED_CYCLES */ |
1729 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; | 1799 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = |
1800 | X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); | ||
1730 | /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ | 1801 | /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ |
1731 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; | 1802 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = |
1803 | X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); | ||
1732 | 1804 | ||
1733 | pr_cont("Westmere events, "); | 1805 | pr_cont("Westmere events, "); |
1734 | break; | 1806 | break; |
@@ -1739,7 +1811,7 @@ __init int intel_pmu_init(void) | |||
1739 | memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, | 1811 | memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, |
1740 | sizeof(hw_cache_event_ids)); | 1812 | sizeof(hw_cache_event_ids)); |
1741 | 1813 | ||
1742 | intel_pmu_lbr_init_nhm(); | 1814 | intel_pmu_lbr_init_snb(); |
1743 | 1815 | ||
1744 | x86_pmu.event_constraints = intel_snb_event_constraints; | 1816 | x86_pmu.event_constraints = intel_snb_event_constraints; |
1745 | x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; | 1817 | x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; |
@@ -1749,9 +1821,11 @@ __init int intel_pmu_init(void) | |||
1749 | x86_pmu.er_flags |= ERF_NO_HT_SHARING; | 1821 | x86_pmu.er_flags |= ERF_NO_HT_SHARING; |
1750 | 1822 | ||
1751 | /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ | 1823 | /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ |
1752 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; | 1824 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = |
1825 | X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); | ||
1753 | /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ | 1826 | /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ |
1754 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1; | 1827 | intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = |
1828 | X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); | ||
1755 | 1829 | ||
1756 | pr_cont("SandyBridge events, "); | 1830 | pr_cont("SandyBridge events, "); |
1757 | break; | 1831 | break; |
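As a quick sanity check on the X86_CONFIG() conversions in this file (the union and macro are added in perf_event.h): with event in bits 0-7, umask in bits 8-15, inv at bit 23 and cmask in bits 24-31, the named initializers encode exactly the raw constants they replace:

	X86_CONFIG(.event=0xc0, .inv=1, .cmask=16)
		= 0xc0 | (1 << 23) | (16 << 24)			= 0x108000c0
	X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1)
		= 0x0e | (0x01 << 8) | (1 << 23) | (1 << 24)	= 0x0180010e
	X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1)
		= 0xb1 | (0x3f << 8) | (1 << 23) | (1 << 24)	= 0x01803fb1
	X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1)
		= 0xb1 | (0x01 << 8) | (1 << 23) | (1 << 24)	= 0x018001b1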
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 73da6b64f5b7..7f64df19e7dd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c | |||
@@ -3,6 +3,7 @@ | |||
3 | #include <linux/slab.h> | 3 | #include <linux/slab.h> |
4 | 4 | ||
5 | #include <asm/perf_event.h> | 5 | #include <asm/perf_event.h> |
6 | #include <asm/insn.h> | ||
6 | 7 | ||
7 | #include "perf_event.h" | 8 | #include "perf_event.h" |
8 | 9 | ||
@@ -439,10 +440,6 @@ void intel_pmu_pebs_enable(struct perf_event *event) | |||
439 | hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; | 440 | hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; |
440 | 441 | ||
441 | cpuc->pebs_enabled |= 1ULL << hwc->idx; | 442 | cpuc->pebs_enabled |= 1ULL << hwc->idx; |
442 | WARN_ON_ONCE(cpuc->enabled); | ||
443 | |||
444 | if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) | ||
445 | intel_pmu_lbr_enable(event); | ||
446 | } | 443 | } |
447 | 444 | ||
448 | void intel_pmu_pebs_disable(struct perf_event *event) | 445 | void intel_pmu_pebs_disable(struct perf_event *event) |
@@ -455,9 +452,6 @@ void intel_pmu_pebs_disable(struct perf_event *event) | |||
455 | wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); | 452 | wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); |
456 | 453 | ||
457 | hwc->config |= ARCH_PERFMON_EVENTSEL_INT; | 454 | hwc->config |= ARCH_PERFMON_EVENTSEL_INT; |
458 | |||
459 | if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) | ||
460 | intel_pmu_lbr_disable(event); | ||
461 | } | 455 | } |
462 | 456 | ||
463 | void intel_pmu_pebs_enable_all(void) | 457 | void intel_pmu_pebs_enable_all(void) |
@@ -476,17 +470,6 @@ void intel_pmu_pebs_disable_all(void) | |||
476 | wrmsrl(MSR_IA32_PEBS_ENABLE, 0); | 470 | wrmsrl(MSR_IA32_PEBS_ENABLE, 0); |
477 | } | 471 | } |
478 | 472 | ||
479 | #include <asm/insn.h> | ||
480 | |||
481 | static inline bool kernel_ip(unsigned long ip) | ||
482 | { | ||
483 | #ifdef CONFIG_X86_32 | ||
484 | return ip > PAGE_OFFSET; | ||
485 | #else | ||
486 | return (long)ip < 0; | ||
487 | #endif | ||
488 | } | ||
489 | |||
490 | static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) | 473 | static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) |
491 | { | 474 | { |
492 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 475 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
@@ -573,6 +556,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, | |||
573 | * both formats and we don't use the other fields in this | 556 | * both formats and we don't use the other fields in this |
574 | * routine. | 557 | * routine. |
575 | */ | 558 | */ |
559 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
576 | struct pebs_record_core *pebs = __pebs; | 560 | struct pebs_record_core *pebs = __pebs; |
577 | struct perf_sample_data data; | 561 | struct perf_sample_data data; |
578 | struct pt_regs regs; | 562 | struct pt_regs regs; |
@@ -603,6 +587,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event, | |||
603 | else | 587 | else |
604 | regs.flags &= ~PERF_EFLAGS_EXACT; | 588 | regs.flags &= ~PERF_EFLAGS_EXACT; |
605 | 589 | ||
590 | if (has_branch_stack(event)) | ||
591 | data.br_stack = &cpuc->lbr_stack; | ||
592 | |||
606 | if (perf_event_overflow(event, &data, ®s)) | 593 | if (perf_event_overflow(event, &data, ®s)) |
607 | x86_pmu_stop(event, 0); | 594 | x86_pmu_stop(event, 0); |
608 | } | 595 | } |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 3fab3de3ce96..520b4265fcd2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c | |||
@@ -3,6 +3,7 @@ | |||
3 | 3 | ||
4 | #include <asm/perf_event.h> | 4 | #include <asm/perf_event.h> |
5 | #include <asm/msr.h> | 5 | #include <asm/msr.h> |
6 | #include <asm/insn.h> | ||
6 | 7 | ||
7 | #include "perf_event.h" | 8 | #include "perf_event.h" |
8 | 9 | ||
@@ -14,6 +15,100 @@ enum { | |||
14 | }; | 15 | }; |
15 | 16 | ||
16 | /* | 17 | /* |
18 | * Intel LBR_SELECT bits | ||
19 | * Intel Vol3a, April 2011, Section 16.7 Table 16-10 | ||
20 | * | ||
21 | * Hardware branch filter (not available on all CPUs) | ||
22 | */ | ||
23 | #define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ | ||
24 | #define LBR_USER_BIT 1 /* do not capture at ring > 0 */ | ||
25 | #define LBR_JCC_BIT 2 /* do not capture conditional branches */ | ||
26 | #define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ | ||
27 | #define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ | ||
28 | #define LBR_RETURN_BIT 5 /* do not capture near returns */ | ||
29 | #define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ | ||
30 | #define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ | ||
31 | #define LBR_FAR_BIT 8 /* do not capture far branches */ | ||
32 | |||
33 | #define LBR_KERNEL (1 << LBR_KERNEL_BIT) | ||
34 | #define LBR_USER (1 << LBR_USER_BIT) | ||
35 | #define LBR_JCC (1 << LBR_JCC_BIT) | ||
36 | #define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) | ||
37 | #define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) | ||
38 | #define LBR_RETURN (1 << LBR_RETURN_BIT) | ||
39 | #define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) | ||
40 | #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) | ||
41 | #define LBR_FAR (1 << LBR_FAR_BIT) | ||
42 | |||
43 | #define LBR_PLM (LBR_KERNEL | LBR_USER) | ||
44 | |||
45 | #define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ | ||
46 | #define LBR_NOT_SUPP -1 /* LBR filter not supported */ | ||
47 | #define LBR_IGN 0 /* ignored */ | ||
48 | |||
49 | #define LBR_ANY \ | ||
50 | (LBR_JCC |\ | ||
51 | LBR_REL_CALL |\ | ||
52 | LBR_IND_CALL |\ | ||
53 | LBR_RETURN |\ | ||
54 | LBR_REL_JMP |\ | ||
55 | LBR_IND_JMP |\ | ||
56 | LBR_FAR) | ||
57 | |||
58 | #define LBR_FROM_FLAG_MISPRED (1ULL << 63) | ||
59 | |||
60 | #define for_each_branch_sample_type(x) \ | ||
61 | for ((x) = PERF_SAMPLE_BRANCH_USER; \ | ||
62 | (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) | ||
63 | |||
64 | /* | ||
65 | * x86 control flow change classification | ||
66 | * x86 control flow changes include branches, interrupts, traps and faults | ||
67 | */ | ||
68 | enum { | ||
69 | X86_BR_NONE = 0, /* unknown */ | ||
70 | |||
71 | X86_BR_USER = 1 << 0, /* branch target is user */ | ||
72 | X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ | ||
73 | |||
74 | X86_BR_CALL = 1 << 2, /* call */ | ||
75 | X86_BR_RET = 1 << 3, /* return */ | ||
76 | X86_BR_SYSCALL = 1 << 4, /* syscall */ | ||
77 | X86_BR_SYSRET = 1 << 5, /* syscall return */ | ||
78 | X86_BR_INT = 1 << 6, /* sw interrupt */ | ||
79 | X86_BR_IRET = 1 << 7, /* return from interrupt */ | ||
80 | X86_BR_JCC = 1 << 8, /* conditional */ | ||
81 | X86_BR_JMP = 1 << 9, /* jump */ | ||
82 | X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ | ||
83 | X86_BR_IND_CALL = 1 << 11,/* indirect calls */ | ||
84 | }; | ||
85 | |||
86 | #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) | ||
87 | |||
88 | #define X86_BR_ANY \ | ||
89 | (X86_BR_CALL |\ | ||
90 | X86_BR_RET |\ | ||
91 | X86_BR_SYSCALL |\ | ||
92 | X86_BR_SYSRET |\ | ||
93 | X86_BR_INT |\ | ||
94 | X86_BR_IRET |\ | ||
95 | X86_BR_JCC |\ | ||
96 | X86_BR_JMP |\ | ||
97 | X86_BR_IRQ |\ | ||
98 | X86_BR_IND_CALL) | ||
99 | |||
100 | #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) | ||
101 | |||
102 | #define X86_BR_ANY_CALL \ | ||
103 | (X86_BR_CALL |\ | ||
104 | X86_BR_IND_CALL |\ | ||
105 | X86_BR_SYSCALL |\ | ||
106 | X86_BR_IRQ |\ | ||
107 | X86_BR_INT) | ||
108 | |||
109 | static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); | ||
110 | |||
111 | /* | ||
17 | * We only support LBR implementations that have FREEZE_LBRS_ON_PMI | 112 | * We only support LBR implementations that have FREEZE_LBRS_ON_PMI |
18 | * otherwise it becomes near impossible to get a reliable stack. | 113 | * otherwise it becomes near impossible to get a reliable stack. |
19 | */ | 114 | */ |
@@ -21,6 +116,10 @@ enum { | |||
21 | static void __intel_pmu_lbr_enable(void) | 116 | static void __intel_pmu_lbr_enable(void) |
22 | { | 117 | { |
23 | u64 debugctl; | 118 | u64 debugctl; |
119 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
120 | |||
121 | if (cpuc->lbr_sel) | ||
122 | wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); | ||
24 | 123 | ||
25 | rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); | 124 | rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); |
26 | debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); | 125 | debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); |
@@ -72,17 +171,15 @@ void intel_pmu_lbr_enable(struct perf_event *event) | |||
72 | if (!x86_pmu.lbr_nr) | 171 | if (!x86_pmu.lbr_nr) |
73 | return; | 172 | return; |
74 | 173 | ||
75 | WARN_ON_ONCE(cpuc->enabled); | ||
76 | |||
77 | /* | 174 | /* |
78 | * Reset the LBR stack if we changed task context to | 175 | * Reset the LBR stack if we changed task context to |
79 | * avoid data leaks. | 176 | * avoid data leaks. |
80 | */ | 177 | */ |
81 | |||
82 | if (event->ctx->task && cpuc->lbr_context != event->ctx) { | 178 | if (event->ctx->task && cpuc->lbr_context != event->ctx) { |
83 | intel_pmu_lbr_reset(); | 179 | intel_pmu_lbr_reset(); |
84 | cpuc->lbr_context = event->ctx; | 180 | cpuc->lbr_context = event->ctx; |
85 | } | 181 | } |
182 | cpuc->br_sel = event->hw.branch_reg.reg; | ||
86 | 183 | ||
87 | cpuc->lbr_users++; | 184 | cpuc->lbr_users++; |
88 | } | 185 | } |
@@ -97,8 +194,11 @@ void intel_pmu_lbr_disable(struct perf_event *event) | |||
97 | cpuc->lbr_users--; | 194 | cpuc->lbr_users--; |
98 | WARN_ON_ONCE(cpuc->lbr_users < 0); | 195 | WARN_ON_ONCE(cpuc->lbr_users < 0); |
99 | 196 | ||
100 | if (cpuc->enabled && !cpuc->lbr_users) | 197 | if (cpuc->enabled && !cpuc->lbr_users) { |
101 | __intel_pmu_lbr_disable(); | 198 | __intel_pmu_lbr_disable(); |
199 | /* avoid stale pointer */ | ||
200 | cpuc->lbr_context = NULL; | ||
201 | } | ||
102 | } | 202 | } |
103 | 203 | ||
104 | void intel_pmu_lbr_enable_all(void) | 204 | void intel_pmu_lbr_enable_all(void) |
@@ -117,6 +217,9 @@ void intel_pmu_lbr_disable_all(void) | |||
117 | __intel_pmu_lbr_disable(); | 217 | __intel_pmu_lbr_disable(); |
118 | } | 218 | } |
119 | 219 | ||
220 | /* | ||
221 | * TOS = most recently recorded branch | ||
222 | */ | ||
120 | static inline u64 intel_pmu_lbr_tos(void) | 223 | static inline u64 intel_pmu_lbr_tos(void) |
121 | { | 224 | { |
122 | u64 tos; | 225 | u64 tos; |
@@ -144,15 +247,15 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) | |||
144 | 247 | ||
145 | rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); | 248 | rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); |
146 | 249 | ||
147 | cpuc->lbr_entries[i].from = msr_lastbranch.from; | 250 | cpuc->lbr_entries[i].from = msr_lastbranch.from; |
148 | cpuc->lbr_entries[i].to = msr_lastbranch.to; | 251 | cpuc->lbr_entries[i].to = msr_lastbranch.to; |
149 | cpuc->lbr_entries[i].flags = 0; | 252 | cpuc->lbr_entries[i].mispred = 0; |
253 | cpuc->lbr_entries[i].predicted = 0; | ||
254 | cpuc->lbr_entries[i].reserved = 0; | ||
150 | } | 255 | } |
151 | cpuc->lbr_stack.nr = i; | 256 | cpuc->lbr_stack.nr = i; |
152 | } | 257 | } |
153 | 258 | ||
154 | #define LBR_FROM_FLAG_MISPRED (1ULL << 63) | ||
155 | |||
156 | /* | 259 | /* |
157 | * Due to lack of segmentation in Linux the effective address (offset) | 260 | * Due to lack of segmentation in Linux the effective address (offset) |
158 | * is the same as the linear address, allowing us to merge the LIP and EIP | 261 | * is the same as the linear address, allowing us to merge the LIP and EIP |
@@ -167,19 +270,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) | |||
167 | 270 | ||
168 | for (i = 0; i < x86_pmu.lbr_nr; i++) { | 271 | for (i = 0; i < x86_pmu.lbr_nr; i++) { |
169 | unsigned long lbr_idx = (tos - i) & mask; | 272 | unsigned long lbr_idx = (tos - i) & mask; |
170 | u64 from, to, flags = 0; | 273 | u64 from, to, mis = 0, pred = 0; |
171 | 274 | ||
172 | rdmsrl(x86_pmu.lbr_from + lbr_idx, from); | 275 | rdmsrl(x86_pmu.lbr_from + lbr_idx, from); |
173 | rdmsrl(x86_pmu.lbr_to + lbr_idx, to); | 276 | rdmsrl(x86_pmu.lbr_to + lbr_idx, to); |
174 | 277 | ||
175 | if (lbr_format == LBR_FORMAT_EIP_FLAGS) { | 278 | if (lbr_format == LBR_FORMAT_EIP_FLAGS) { |
176 | flags = !!(from & LBR_FROM_FLAG_MISPRED); | 279 | mis = !!(from & LBR_FROM_FLAG_MISPRED); |
280 | pred = !mis; | ||
177 | from = (u64)((((s64)from) << 1) >> 1); | 281 | from = (u64)((((s64)from) << 1) >> 1); |
178 | } | 282 | } |
179 | 283 | ||
180 | cpuc->lbr_entries[i].from = from; | 284 | cpuc->lbr_entries[i].from = from; |
181 | cpuc->lbr_entries[i].to = to; | 285 | cpuc->lbr_entries[i].to = to; |
182 | cpuc->lbr_entries[i].flags = flags; | 286 | cpuc->lbr_entries[i].mispred = mis; |
287 | cpuc->lbr_entries[i].predicted = pred; | ||
288 | cpuc->lbr_entries[i].reserved = 0; | ||
183 | } | 289 | } |
184 | cpuc->lbr_stack.nr = i; | 290 | cpuc->lbr_stack.nr = i; |
185 | } | 291 | } |
@@ -195,28 +301,404 @@ void intel_pmu_lbr_read(void) | |||
195 | intel_pmu_lbr_read_32(cpuc); | 301 | intel_pmu_lbr_read_32(cpuc); |
196 | else | 302 | else |
197 | intel_pmu_lbr_read_64(cpuc); | 303 | intel_pmu_lbr_read_64(cpuc); |
304 | |||
305 | intel_pmu_lbr_filter(cpuc); | ||
306 | } | ||
307 | |||
308 | /* | ||
309 | * SW filter is used: | ||
310 | * - in case there is no HW filter | ||
311 | * - in case the HW filter has errata or limitations | ||
312 | */ | ||
313 | static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) | ||
314 | { | ||
315 | u64 br_type = event->attr.branch_sample_type; | ||
316 | int mask = 0; | ||
317 | |||
318 | if (br_type & PERF_SAMPLE_BRANCH_USER) | ||
319 | mask |= X86_BR_USER; | ||
320 | |||
321 | if (br_type & PERF_SAMPLE_BRANCH_KERNEL) | ||
322 | mask |= X86_BR_KERNEL; | ||
323 | |||
324 | /* we ignore BRANCH_HV here */ | ||
325 | |||
326 | if (br_type & PERF_SAMPLE_BRANCH_ANY) | ||
327 | mask |= X86_BR_ANY; | ||
328 | |||
329 | if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) | ||
330 | mask |= X86_BR_ANY_CALL; | ||
331 | |||
332 | if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) | ||
333 | mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; | ||
334 | |||
335 | if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) | ||
336 | mask |= X86_BR_IND_CALL; | ||
337 | /* | ||
338 | * stash the actual user request into reg; it may | ||
339 | * be used by the fixup code on some CPUs | ||
340 | */ | ||
341 | event->hw.branch_reg.reg = mask; | ||
342 | } | ||
343 | |||
344 | /* | ||
345 | * set up the HW LBR filter | ||
346 | * Used only when available; it may not be enough to disambiguate | ||
347 | * all branches and may need the help of the SW filter | ||
348 | */ | ||
349 | static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) | ||
350 | { | ||
351 | struct hw_perf_event_extra *reg; | ||
352 | u64 br_type = event->attr.branch_sample_type; | ||
353 | u64 mask = 0, m; | ||
354 | u64 v; | ||
355 | |||
356 | for_each_branch_sample_type(m) { | ||
357 | if (!(br_type & m)) | ||
358 | continue; | ||
359 | |||
360 | v = x86_pmu.lbr_sel_map[m]; | ||
361 | if (v == LBR_NOT_SUPP) | ||
362 | return -EOPNOTSUPP; | ||
363 | |||
364 | if (v != LBR_IGN) | ||
365 | mask |= v; | ||
366 | } | ||
367 | reg = &event->hw.branch_reg; | ||
368 | reg->idx = EXTRA_REG_LBR; | ||
369 | |||
370 | /* LBR_SELECT operates in suppress mode so invert mask */ | ||
371 | reg->config = ~mask & x86_pmu.lbr_sel_mask; | ||
372 | |||
373 | return 0; | ||
374 | } | ||
375 | |||
376 | int intel_pmu_setup_lbr_filter(struct perf_event *event) | ||
377 | { | ||
378 | int ret = 0; | ||
379 | |||
380 | /* | ||
381 | * no LBR on this PMU | ||
382 | */ | ||
383 | if (!x86_pmu.lbr_nr) | ||
384 | return -EOPNOTSUPP; | ||
385 | |||
386 | /* | ||
387 | * set up the SW LBR filter | ||
388 | */ | ||
389 | intel_pmu_setup_sw_lbr_filter(event); | ||
390 | |||
391 | /* | ||
392 | * set up the HW LBR filter, if any | ||
393 | */ | ||
394 | if (x86_pmu.lbr_sel_map) | ||
395 | ret = intel_pmu_setup_hw_lbr_filter(event); | ||
396 | |||
397 | return ret; | ||
398 | } | ||
399 | |||
400 | /* | ||
401 | * return the type of control flow change at address "from" | ||
402 | * the instruction is not necessarily a branch (in case of an interrupt). | ||
403 | * | ||
404 | * The branch type returned also includes the priv level of the | ||
405 | * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). | ||
406 | * | ||
407 | * If a branch type is unknown OR the instruction cannot be | ||
408 | * decoded (e.g., text page not present), then X86_BR_NONE is | ||
409 | * returned. | ||
410 | */ | ||
411 | static int branch_type(unsigned long from, unsigned long to) | ||
412 | { | ||
413 | struct insn insn; | ||
414 | void *addr; | ||
415 | int bytes, size = MAX_INSN_SIZE; | ||
416 | int ret = X86_BR_NONE; | ||
417 | int ext, to_plm, from_plm; | ||
418 | u8 buf[MAX_INSN_SIZE]; | ||
419 | int is64 = 0; | ||
420 | |||
421 | to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; | ||
422 | from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; | ||
423 | |||
424 | /* | ||
425 | * may be zero if the LBR did not fill up after a reset by the time | ||
426 | * we get a PMU interrupt | ||
427 | */ | ||
428 | if (from == 0 || to == 0) | ||
429 | return X86_BR_NONE; | ||
430 | |||
431 | if (from_plm == X86_BR_USER) { | ||
432 | /* | ||
433 | * can happen if measuring at the user level only | ||
434 | * and we interrupt in a kernel thread, e.g., idle. | ||
435 | */ | ||
436 | if (!current->mm) | ||
437 | return X86_BR_NONE; | ||
438 | |||
439 | /* may fail if text not present */ | ||
440 | bytes = copy_from_user_nmi(buf, (void __user *)from, size); | ||
441 | if (bytes != size) | ||
442 | return X86_BR_NONE; | ||
443 | |||
444 | addr = buf; | ||
445 | } else | ||
446 | addr = (void *)from; | ||
447 | |||
448 | /* | ||
449 | * decoder needs to know the ABI especially | ||
450 | * on 64-bit systems running 32-bit apps | ||
451 | */ | ||
452 | #ifdef CONFIG_X86_64 | ||
453 | is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); | ||
454 | #endif | ||
455 | insn_init(&insn, addr, is64); | ||
456 | insn_get_opcode(&insn); | ||
457 | |||
458 | switch (insn.opcode.bytes[0]) { | ||
459 | case 0xf: | ||
460 | switch (insn.opcode.bytes[1]) { | ||
461 | case 0x05: /* syscall */ | ||
462 | case 0x34: /* sysenter */ | ||
463 | ret = X86_BR_SYSCALL; | ||
464 | break; | ||
465 | case 0x07: /* sysret */ | ||
466 | case 0x35: /* sysexit */ | ||
467 | ret = X86_BR_SYSRET; | ||
468 | break; | ||
469 | case 0x80 ... 0x8f: /* conditional */ | ||
470 | ret = X86_BR_JCC; | ||
471 | break; | ||
472 | default: | ||
473 | ret = X86_BR_NONE; | ||
474 | } | ||
475 | break; | ||
476 | case 0x70 ... 0x7f: /* conditional */ | ||
477 | ret = X86_BR_JCC; | ||
478 | break; | ||
479 | case 0xc2: /* near ret */ | ||
480 | case 0xc3: /* near ret */ | ||
481 | case 0xca: /* far ret */ | ||
482 | case 0xcb: /* far ret */ | ||
483 | ret = X86_BR_RET; | ||
484 | break; | ||
485 | case 0xcf: /* iret */ | ||
486 | ret = X86_BR_IRET; | ||
487 | break; | ||
488 | case 0xcc ... 0xce: /* int */ | ||
489 | ret = X86_BR_INT; | ||
490 | break; | ||
491 | case 0xe8: /* call near rel */ | ||
492 | case 0x9a: /* call far absolute */ | ||
493 | ret = X86_BR_CALL; | ||
494 | break; | ||
495 | case 0xe0 ... 0xe3: /* loop jmp */ | ||
496 | ret = X86_BR_JCC; | ||
497 | break; | ||
498 | case 0xe9 ... 0xeb: /* jmp */ | ||
499 | ret = X86_BR_JMP; | ||
500 | break; | ||
501 | case 0xff: /* call near absolute, call far absolute ind */ | ||
502 | insn_get_modrm(&insn); | ||
503 | ext = (insn.modrm.bytes[0] >> 3) & 0x7; | ||
504 | switch (ext) { | ||
505 | case 2: /* near ind call */ | ||
506 | case 3: /* far ind call */ | ||
507 | ret = X86_BR_IND_CALL; | ||
508 | break; | ||
509 | case 4: | ||
510 | case 5: | ||
511 | ret = X86_BR_JMP; | ||
512 | break; | ||
513 | } | ||
514 | break; | ||
515 | default: | ||
516 | ret = X86_BR_NONE; | ||
517 | } | ||
518 | /* | ||
519 | * interrupts, traps and faults (and thus ring transitions) may | ||
520 | * occur on any instruction. Thus, to classify them correctly, | ||
521 | * we need to first look at the from and to priv levels. If they | ||
522 | * are different and to is in the kernel, then it indicates | ||
523 | * a ring transition. If the from instruction is not a ring | ||
524 | * transition instr (syscall, sysenter, int), then it means | ||
525 | * it was an irq, trap or fault. | ||
526 | * | ||
527 | * we have no way of detecting kernel to kernel faults. | ||
528 | */ | ||
529 | if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL | ||
530 | && ret != X86_BR_SYSCALL && ret != X86_BR_INT) | ||
531 | ret = X86_BR_IRQ; | ||
532 | |||
533 | /* | ||
534 | * branch priv level determined by target as | ||
535 | * is done by HW when LBR_SELECT is implemented | ||
536 | */ | ||
537 | if (ret != X86_BR_NONE) | ||
538 | ret |= to_plm; | ||
539 | |||
540 | return ret; | ||
541 | } | ||
542 | |||
543 | /* | ||
544 | * implement the actual branch filter based on user demand. | ||
545 | * Hardware may not exactly satisfy that request, thus | ||
546 | * we need to inspect opcodes. Mismatched branches are | ||
547 | * discarded. Therefore, the number of branches returned | ||
548 | * in a PERF_SAMPLE_BRANCH_STACK sample may vary. | ||
549 | */ | ||
550 | static void | ||
551 | intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) | ||
552 | { | ||
553 | u64 from, to; | ||
554 | int br_sel = cpuc->br_sel; | ||
555 | int i, j, type; | ||
556 | bool compress = false; | ||
557 | |||
558 | /* if sampling all branches, then nothing to filter */ | ||
559 | if ((br_sel & X86_BR_ALL) == X86_BR_ALL) | ||
560 | return; | ||
561 | |||
562 | for (i = 0; i < cpuc->lbr_stack.nr; i++) { | ||
563 | |||
564 | from = cpuc->lbr_entries[i].from; | ||
565 | to = cpuc->lbr_entries[i].to; | ||
566 | |||
567 | type = branch_type(from, to); | ||
568 | |||
569 | /* if type does not correspond, then discard */ | ||
570 | if (type == X86_BR_NONE || (br_sel & type) != type) { | ||
571 | cpuc->lbr_entries[i].from = 0; | ||
572 | compress = true; | ||
573 | } | ||
574 | } | ||
575 | |||
576 | if (!compress) | ||
577 | return; | ||
578 | |||
579 | /* remove all entries with from=0 */ | ||
580 | for (i = 0; i < cpuc->lbr_stack.nr; ) { | ||
581 | if (!cpuc->lbr_entries[i].from) { | ||
582 | j = i; | ||
583 | while (++j < cpuc->lbr_stack.nr) | ||
584 | cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; | ||
585 | cpuc->lbr_stack.nr--; | ||
586 | if (!cpuc->lbr_entries[i].from) | ||
587 | continue; | ||
588 | } | ||
589 | i++; | ||
590 | } | ||
198 | } | 591 | } |
199 | 592 | ||
593 | /* | ||
594 | * Map interface branch filters onto LBR filters | ||
595 | */ | ||
596 | static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { | ||
597 | [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, | ||
598 | [PERF_SAMPLE_BRANCH_USER] = LBR_USER, | ||
599 | [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, | ||
600 | [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, | ||
601 | [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP | ||
602 | | LBR_IND_JMP | LBR_FAR, | ||
603 | /* | ||
604 | * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches | ||
605 | */ | ||
606 | [PERF_SAMPLE_BRANCH_ANY_CALL] = | ||
607 | LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, | ||
608 | /* | ||
609 | * NHM/WSM erratum: must include IND_JMP to capture IND_CALL | ||
610 | */ | ||
611 | [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, | ||
612 | }; | ||
613 | |||
614 | static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { | ||
615 | [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, | ||
616 | [PERF_SAMPLE_BRANCH_USER] = LBR_USER, | ||
617 | [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, | ||
618 | [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, | ||
619 | [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, | ||
620 | [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL | ||
621 | | LBR_FAR, | ||
622 | [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, | ||
623 | }; | ||
624 | |||
625 | /* core */ | ||
200 | void intel_pmu_lbr_init_core(void) | 626 | void intel_pmu_lbr_init_core(void) |
201 | { | 627 | { |
202 | x86_pmu.lbr_nr = 4; | 628 | x86_pmu.lbr_nr = 4; |
203 | x86_pmu.lbr_tos = 0x01c9; | 629 | x86_pmu.lbr_tos = MSR_LBR_TOS; |
204 | x86_pmu.lbr_from = 0x40; | 630 | x86_pmu.lbr_from = MSR_LBR_CORE_FROM; |
205 | x86_pmu.lbr_to = 0x60; | 631 | x86_pmu.lbr_to = MSR_LBR_CORE_TO; |
632 | |||
633 | /* | ||
634 | * SW branch filter usage: | ||
635 | * - compensate for lack of HW filter | ||
636 | */ | ||
637 | pr_cont("4-deep LBR, "); | ||
206 | } | 638 | } |
207 | 639 | ||
640 | /* nehalem/westmere */ | ||
208 | void intel_pmu_lbr_init_nhm(void) | 641 | void intel_pmu_lbr_init_nhm(void) |
209 | { | 642 | { |
210 | x86_pmu.lbr_nr = 16; | 643 | x86_pmu.lbr_nr = 16; |
211 | x86_pmu.lbr_tos = 0x01c9; | 644 | x86_pmu.lbr_tos = MSR_LBR_TOS; |
212 | x86_pmu.lbr_from = 0x680; | 645 | x86_pmu.lbr_from = MSR_LBR_NHM_FROM; |
213 | x86_pmu.lbr_to = 0x6c0; | 646 | x86_pmu.lbr_to = MSR_LBR_NHM_TO; |
647 | |||
648 | x86_pmu.lbr_sel_mask = LBR_SEL_MASK; | ||
649 | x86_pmu.lbr_sel_map = nhm_lbr_sel_map; | ||
650 | |||
651 | /* | ||
652 | * SW branch filter usage: | ||
653 | * - work around the LBR_SEL errata (see above) | ||
654 | * - support syscall, sysret capture. | ||
655 | * That requires LBR_FAR, but that means far | ||
656 | * jmps need to be filtered out | ||
657 | */ | ||
658 | pr_cont("16-deep LBR, "); | ||
659 | } | ||
660 | |||
661 | /* sandy bridge */ | ||
662 | void intel_pmu_lbr_init_snb(void) | ||
663 | { | ||
664 | x86_pmu.lbr_nr = 16; | ||
665 | x86_pmu.lbr_tos = MSR_LBR_TOS; | ||
666 | x86_pmu.lbr_from = MSR_LBR_NHM_FROM; | ||
667 | x86_pmu.lbr_to = MSR_LBR_NHM_TO; | ||
668 | |||
669 | x86_pmu.lbr_sel_mask = LBR_SEL_MASK; | ||
670 | x86_pmu.lbr_sel_map = snb_lbr_sel_map; | ||
671 | |||
672 | /* | ||
673 | * SW branch filter usage: | ||
674 | * - support syscall, sysret capture. | ||
675 | * That requires LBR_FAR, but that means far | ||
676 | * jmps need to be filtered out | ||
677 | */ | ||
678 | pr_cont("16-deep LBR, "); | ||
214 | } | 679 | } |
215 | 680 | ||
681 | /* atom */ | ||
216 | void intel_pmu_lbr_init_atom(void) | 682 | void intel_pmu_lbr_init_atom(void) |
217 | { | 683 | { |
684 | /* | ||
685 | * only models starting at stepping 10 seem | ||
686 | * to have an operational LBR which can freeze | ||
687 | * on a PMU interrupt | ||
688 | */ | ||
689 | if (boot_cpu_data.x86_mask < 10) { | ||
690 | pr_cont("LBR disabled due to erratum"); | ||
691 | return; | ||
692 | } | ||
693 | |||
218 | x86_pmu.lbr_nr = 8; | 694 | x86_pmu.lbr_nr = 8; |
219 | x86_pmu.lbr_tos = 0x01c9; | 695 | x86_pmu.lbr_tos = MSR_LBR_TOS; |
220 | x86_pmu.lbr_from = 0x40; | 696 | x86_pmu.lbr_from = MSR_LBR_CORE_FROM; |
221 | x86_pmu.lbr_to = 0x60; | 697 | x86_pmu.lbr_to = MSR_LBR_CORE_TO; |
698 | |||
699 | /* | ||
700 | * SW branch filter usage: | ||
701 | * - compensate for lack of HW filter | ||
702 | */ | ||
703 | pr_cont("8-deep LBR, "); | ||
222 | } | 704 | } |
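The *_lbr_sel_map tables above translate the generic PERF_SAMPLE_BRANCH_* flags requested through perf_event_attr.branch_sample_type into model-specific LBR_SELECT bits, with the "SW branch filter usage" comments noting what still has to be filtered in software. A minimal user-space sketch of such a request is shown below; it assumes a uapi header from a kernel with this branch-stack sampling support and is illustrative only, not part of the patch.

/*
 * Hedged sketch: request call-branch LBR sampling from user space.
 * Assumes PERF_SAMPLE_BRANCH_STACK is available in linux/perf_event.h.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_call_branch_sampler(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	/* mapped by the sel_map to LBR_REL_CALL|LBR_IND_CALL (plus erratum bits on NHM/WSM) */
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY_CALL | PERF_SAMPLE_BRANCH_USER;

	/* measure the calling thread on any CPU */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}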
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index c7f64e6f537a..addf9e82a7f2 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c | |||
@@ -40,6 +40,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) | |||
40 | { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, | 40 | { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, |
41 | { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, | 41 | { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, |
42 | { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, | 42 | { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, |
43 | { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, | ||
43 | { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, | 44 | { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, |
44 | { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, | 45 | { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, |
45 | { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, | 46 | { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, |
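The new scattered entry makes CPUID leaf 0x80000007 EDX bit 7 visible as X86_FEATURE_HW_PSTATE. A hedged sketch of how other kernel code could test the bit once it is populated (illustrative; not taken from this patch):

#include <asm/cpufeature.h>
#include <linux/types.h>

static bool cpu_has_hw_pstate(void)
{
	/* set by init_scattered_cpuid_features() from CPUID 0x80000007:EDX[7] */
	return boot_cpu_has(X86_FEATURE_HW_PSTATE);
}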
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 642f75a68cd5..11891ca7b716 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c | |||
@@ -62,16 +62,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | |||
62 | 62 | ||
63 | if (!userbuf) { | 63 | if (!userbuf) { |
64 | memcpy(buf, (vaddr + offset), csize); | 64 | memcpy(buf, (vaddr + offset), csize); |
65 | kunmap_atomic(vaddr, KM_PTE0); | 65 | kunmap_atomic(vaddr); |
66 | } else { | 66 | } else { |
67 | if (!kdump_buf_page) { | 67 | if (!kdump_buf_page) { |
68 | printk(KERN_WARNING "Kdump: Kdump buffer page not" | 68 | printk(KERN_WARNING "Kdump: Kdump buffer page not" |
69 | " allocated\n"); | 69 | " allocated\n"); |
70 | kunmap_atomic(vaddr, KM_PTE0); | 70 | kunmap_atomic(vaddr); |
71 | return -EFAULT; | 71 | return -EFAULT; |
72 | } | 72 | } |
73 | copy_page(kdump_buf_page, vaddr); | 73 | copy_page(kdump_buf_page, vaddr); |
74 | kunmap_atomic(vaddr, KM_PTE0); | 74 | kunmap_atomic(vaddr); |
75 | if (copy_to_user(buf, (kdump_buf_page + offset), csize)) | 75 | if (copy_to_user(buf, (kdump_buf_page + offset), csize)) |
76 | return -EFAULT; | 76 | return -EFAULT; |
77 | } | 77 | } |
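The hunk above drops the KM_PTE0 slot argument because the atomic kmap API no longer takes a km_type. A hedged sketch of the resulting pairing, assuming a page that may live in highmem (illustrative only):

#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/string.h>

static void copy_from_pfn(void *dst, unsigned long pfn,
			  unsigned long offset, size_t len)
{
	void *vaddr = kmap_atomic(pfn_to_page(pfn));	/* no KM_* slot anymore */

	memcpy(dst, vaddr + offset, len);
	kunmap_atomic(vaddr);
}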
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 52821799a702..3ae2ced4a874 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/bootmem.h> | 4 | #include <linux/bootmem.h> |
5 | #include <linux/export.h> | 5 | #include <linux/export.h> |
6 | #include <linux/io.h> | 6 | #include <linux/io.h> |
7 | #include <linux/irqdomain.h> | ||
7 | #include <linux/interrupt.h> | 8 | #include <linux/interrupt.h> |
8 | #include <linux/list.h> | 9 | #include <linux/list.h> |
9 | #include <linux/of.h> | 10 | #include <linux/of.h> |
@@ -17,64 +18,14 @@ | |||
17 | #include <linux/initrd.h> | 18 | #include <linux/initrd.h> |
18 | 19 | ||
19 | #include <asm/hpet.h> | 20 | #include <asm/hpet.h> |
20 | #include <asm/irq_controller.h> | ||
21 | #include <asm/apic.h> | 21 | #include <asm/apic.h> |
22 | #include <asm/pci_x86.h> | 22 | #include <asm/pci_x86.h> |
23 | 23 | ||
24 | __initdata u64 initial_dtb; | 24 | __initdata u64 initial_dtb; |
25 | char __initdata cmd_line[COMMAND_LINE_SIZE]; | 25 | char __initdata cmd_line[COMMAND_LINE_SIZE]; |
26 | static LIST_HEAD(irq_domains); | ||
27 | static DEFINE_RAW_SPINLOCK(big_irq_lock); | ||
28 | 26 | ||
29 | int __initdata of_ioapic; | 27 | int __initdata of_ioapic; |
30 | 28 | ||
31 | #ifdef CONFIG_X86_IO_APIC | ||
32 | static void add_interrupt_host(struct irq_domain *ih) | ||
33 | { | ||
34 | unsigned long flags; | ||
35 | |||
36 | raw_spin_lock_irqsave(&big_irq_lock, flags); | ||
37 | list_add(&ih->l, &irq_domains); | ||
38 | raw_spin_unlock_irqrestore(&big_irq_lock, flags); | ||
39 | } | ||
40 | #endif | ||
41 | |||
42 | static struct irq_domain *get_ih_from_node(struct device_node *controller) | ||
43 | { | ||
44 | struct irq_domain *ih, *found = NULL; | ||
45 | unsigned long flags; | ||
46 | |||
47 | raw_spin_lock_irqsave(&big_irq_lock, flags); | ||
48 | list_for_each_entry(ih, &irq_domains, l) { | ||
49 | if (ih->controller == controller) { | ||
50 | found = ih; | ||
51 | break; | ||
52 | } | ||
53 | } | ||
54 | raw_spin_unlock_irqrestore(&big_irq_lock, flags); | ||
55 | return found; | ||
56 | } | ||
57 | |||
58 | unsigned int irq_create_of_mapping(struct device_node *controller, | ||
59 | const u32 *intspec, unsigned int intsize) | ||
60 | { | ||
61 | struct irq_domain *ih; | ||
62 | u32 virq, type; | ||
63 | int ret; | ||
64 | |||
65 | ih = get_ih_from_node(controller); | ||
66 | if (!ih) | ||
67 | return 0; | ||
68 | ret = ih->xlate(ih, intspec, intsize, &virq, &type); | ||
69 | if (ret) | ||
70 | return 0; | ||
71 | if (type == IRQ_TYPE_NONE) | ||
72 | return virq; | ||
73 | irq_set_irq_type(virq, type); | ||
74 | return virq; | ||
75 | } | ||
76 | EXPORT_SYMBOL_GPL(irq_create_of_mapping); | ||
77 | |||
78 | unsigned long pci_address_to_pio(phys_addr_t address) | 29 | unsigned long pci_address_to_pio(phys_addr_t address) |
79 | { | 30 | { |
80 | /* | 31 | /* |
@@ -354,36 +305,43 @@ static struct of_ioapic_type of_ioapic_type[] = | |||
354 | }, | 305 | }, |
355 | }; | 306 | }; |
356 | 307 | ||
357 | static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, | 308 | static int ioapic_xlate(struct irq_domain *domain, |
358 | u32 *out_hwirq, u32 *out_type) | 309 | struct device_node *controller, |
310 | const u32 *intspec, u32 intsize, | ||
311 | irq_hw_number_t *out_hwirq, u32 *out_type) | ||
359 | { | 312 | { |
360 | struct mp_ioapic_gsi *gsi_cfg; | ||
361 | struct io_apic_irq_attr attr; | 313 | struct io_apic_irq_attr attr; |
362 | struct of_ioapic_type *it; | 314 | struct of_ioapic_type *it; |
363 | u32 line, idx, type; | 315 | u32 line, idx; |
316 | int rc; | ||
364 | 317 | ||
365 | if (intsize < 2) | 318 | if (WARN_ON(intsize < 2)) |
366 | return -EINVAL; | 319 | return -EINVAL; |
367 | 320 | ||
368 | line = *intspec; | 321 | line = intspec[0]; |
369 | idx = (u32) id->priv; | ||
370 | gsi_cfg = mp_ioapic_gsi_routing(idx); | ||
371 | *out_hwirq = line + gsi_cfg->gsi_base; | ||
372 | |||
373 | intspec++; | ||
374 | type = *intspec; | ||
375 | 322 | ||
376 | if (type >= ARRAY_SIZE(of_ioapic_type)) | 323 | if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) |
377 | return -EINVAL; | 324 | return -EINVAL; |
378 | 325 | ||
379 | it = of_ioapic_type + type; | 326 | it = &of_ioapic_type[intspec[1]]; |
380 | *out_type = it->out_type; | ||
381 | 327 | ||
328 | idx = (u32) domain->host_data; | ||
382 | set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); | 329 | set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); |
383 | 330 | ||
384 | return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); | 331 | rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line), |
332 | cpu_to_node(0), &attr); | ||
333 | if (rc) | ||
334 | return rc; | ||
335 | |||
336 | *out_hwirq = line; | ||
337 | *out_type = it->out_type; | ||
338 | return 0; | ||
385 | } | 339 | } |
386 | 340 | ||
341 | const struct irq_domain_ops ioapic_irq_domain_ops = { | ||
342 | .xlate = ioapic_xlate, | ||
343 | }; | ||
344 | |||
387 | static void __init ioapic_add_ofnode(struct device_node *np) | 345 | static void __init ioapic_add_ofnode(struct device_node *np) |
388 | { | 346 | { |
389 | struct resource r; | 347 | struct resource r; |
@@ -399,13 +357,14 @@ static void __init ioapic_add_ofnode(struct device_node *np) | |||
399 | for (i = 0; i < nr_ioapics; i++) { | 357 | for (i = 0; i < nr_ioapics; i++) { |
400 | if (r.start == mpc_ioapic_addr(i)) { | 358 | if (r.start == mpc_ioapic_addr(i)) { |
401 | struct irq_domain *id; | 359 | struct irq_domain *id; |
360 | struct mp_ioapic_gsi *gsi_cfg; | ||
361 | |||
362 | gsi_cfg = mp_ioapic_gsi_routing(i); | ||
402 | 363 | ||
403 | id = kzalloc(sizeof(*id), GFP_KERNEL); | 364 | id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0, |
365 | &ioapic_irq_domain_ops, | ||
366 | (void*)i); | ||
404 | BUG_ON(!id); | 367 | BUG_ON(!id); |
405 | id->controller = np; | ||
406 | id->xlate = ioapic_xlate; | ||
407 | id->priv = (void *)i; | ||
408 | add_interrupt_host(id); | ||
409 | return; | 368 | return; |
410 | } | 369 | } |
411 | } | 370 | } |
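With the private irq_domains list gone, the I/O APIC device-tree node is registered through the generic irqdomain layer, and irq_create_of_mapping() now comes from kernel/irq/irqdomain.c instead of this file. A hedged sketch of the registration pattern the hunk switches to (the helper name and the 32-pin assumption are illustrative):

#include <linux/irqdomain.h>
#include <linux/of.h>
#include <linux/kernel.h>

static void register_ioapic_domain(struct device_node *np, int ioapic_idx,
				   unsigned int gsi_base)
{
	/*
	 * Pre-allocated (legacy) mapping: 32 hwirqs starting at hwirq 0,
	 * bound to the Linux irq range beginning at gsi_base; the ioapic
	 * index comes back to ioapic_xlate() through domain->host_data.
	 */
	struct irq_domain *d = irq_domain_add_legacy(np, 32, gsi_base, 0,
						     &ioapic_irq_domain_ops,
						     (void *)(unsigned long)ioapic_idx);
	BUG_ON(!d);
}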
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1aae78f775fc..4025fe4f928f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -252,7 +252,8 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) | |||
252 | unsigned short ss; | 252 | unsigned short ss; |
253 | unsigned long sp; | 253 | unsigned long sp; |
254 | #endif | 254 | #endif |
255 | printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); | 255 | printk(KERN_DEFAULT |
256 | "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); | ||
256 | #ifdef CONFIG_PREEMPT | 257 | #ifdef CONFIG_PREEMPT |
257 | printk("PREEMPT "); | 258 | printk("PREEMPT "); |
258 | #endif | 259 | #endif |
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c99f9ed013d5..88ec9129271d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -87,7 +87,7 @@ void show_registers(struct pt_regs *regs) | |||
87 | int i; | 87 | int i; |
88 | 88 | ||
89 | print_modules(); | 89 | print_modules(); |
90 | __show_regs(regs, 0); | 90 | __show_regs(regs, !user_mode_vm(regs)); |
91 | 91 | ||
92 | printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", | 92 | printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", |
93 | TASK_COMM_LEN, current->comm, task_pid_nr(current), | 93 | TASK_COMM_LEN, current->comm, task_pid_nr(current), |
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6d728d9284bd..17107bd6e1f0 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -129,7 +129,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, | |||
129 | if (!stack) { | 129 | if (!stack) { |
130 | if (regs) | 130 | if (regs) |
131 | stack = (unsigned long *)regs->sp; | 131 | stack = (unsigned long *)regs->sp; |
132 | else if (task && task != current) | 132 | else if (task != current) |
133 | stack = (unsigned long *)task->thread.sp; | 133 | stack = (unsigned long *)task->thread.sp; |
134 | else | 134 | else |
135 | stack = &dummy; | 135 | stack = &dummy; |
@@ -269,11 +269,11 @@ void show_registers(struct pt_regs *regs) | |||
269 | unsigned char c; | 269 | unsigned char c; |
270 | u8 *ip; | 270 | u8 *ip; |
271 | 271 | ||
272 | printk(KERN_EMERG "Stack:\n"); | 272 | printk(KERN_DEFAULT "Stack:\n"); |
273 | show_stack_log_lvl(NULL, regs, (unsigned long *)sp, | 273 | show_stack_log_lvl(NULL, regs, (unsigned long *)sp, |
274 | 0, KERN_EMERG); | 274 | 0, KERN_DEFAULT); |
275 | 275 | ||
276 | printk(KERN_EMERG "Code: "); | 276 | printk(KERN_DEFAULT "Code: "); |
277 | 277 | ||
278 | ip = (u8 *)regs->ip - code_prologue; | 278 | ip = (u8 *)regs->ip - code_prologue; |
279 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { | 279 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 79d97e68f042..7b784f4ef1e4 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -98,12 +98,6 @@ | |||
98 | #endif | 98 | #endif |
99 | .endm | 99 | .endm |
100 | 100 | ||
101 | #ifdef CONFIG_VM86 | ||
102 | #define resume_userspace_sig check_userspace | ||
103 | #else | ||
104 | #define resume_userspace_sig resume_userspace | ||
105 | #endif | ||
106 | |||
107 | /* | 101 | /* |
108 | * User gs save/restore | 102 | * User gs save/restore |
109 | * | 103 | * |
@@ -327,10 +321,19 @@ ret_from_exception: | |||
327 | preempt_stop(CLBR_ANY) | 321 | preempt_stop(CLBR_ANY) |
328 | ret_from_intr: | 322 | ret_from_intr: |
329 | GET_THREAD_INFO(%ebp) | 323 | GET_THREAD_INFO(%ebp) |
330 | check_userspace: | 324 | resume_userspace_sig: |
325 | #ifdef CONFIG_VM86 | ||
331 | movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS | 326 | movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS |
332 | movb PT_CS(%esp), %al | 327 | movb PT_CS(%esp), %al |
333 | andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax | 328 | andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax |
329 | #else | ||
330 | /* | ||
331 | * We can be coming here from a syscall done in the kernel space, | ||
332 | * e.g. a failed kernel_execve(). | ||
333 | */ | ||
334 | movl PT_CS(%esp), %eax | ||
335 | andl $SEGMENT_RPL_MASK, %eax | ||
336 | #endif | ||
334 | cmpl $USER_RPL, %eax | 337 | cmpl $USER_RPL, %eax |
335 | jb resume_kernel # not returning to v8086 or userspace | 338 | jb resume_kernel # not returning to v8086 or userspace |
336 | 339 | ||
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3fe8239fd8fb..734ebd1d3caa 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -320,7 +320,7 @@ ENDPROC(native_usergs_sysret64) | |||
320 | movq %rsp, %rsi | 320 | movq %rsp, %rsi |
321 | 321 | ||
322 | leaq -RBP(%rsp),%rdi /* arg1 for handler */ | 322 | leaq -RBP(%rsp),%rdi /* arg1 for handler */ |
323 | testl $3, CS(%rdi) | 323 | testl $3, CS-RBP(%rsi) |
324 | je 1f | 324 | je 1f |
325 | SWAPGS | 325 | SWAPGS |
326 | /* | 326 | /* |
@@ -330,11 +330,10 @@ ENDPROC(native_usergs_sysret64) | |||
330 | * moving irq_enter into assembly, which would be too much work) | 330 | * moving irq_enter into assembly, which would be too much work) |
331 | */ | 331 | */ |
332 | 1: incl PER_CPU_VAR(irq_count) | 332 | 1: incl PER_CPU_VAR(irq_count) |
333 | jne 2f | 333 | cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp |
334 | mov PER_CPU_VAR(irq_stack_ptr),%rsp | ||
335 | CFI_DEF_CFA_REGISTER rsi | 334 | CFI_DEF_CFA_REGISTER rsi |
336 | 335 | ||
337 | 2: /* Store previous stack value */ | 336 | /* Store previous stack value */ |
338 | pushq %rsi | 337 | pushq %rsi |
339 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ | 338 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ |
340 | 0x77 /* DW_OP_breg7 */, 0, \ | 339 | 0x77 /* DW_OP_breg7 */, 0, \ |
@@ -813,7 +812,7 @@ ret_from_intr: | |||
813 | 812 | ||
814 | /* Restore saved previous stack */ | 813 | /* Restore saved previous stack */ |
815 | popq %rsi | 814 | popq %rsi |
816 | CFI_DEF_CFA_REGISTER rsi | 815 | CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ |
817 | leaq ARGOFFSET-RBP(%rsi), %rsp | 816 | leaq ARGOFFSET-RBP(%rsi), %rsp |
818 | CFI_DEF_CFA_REGISTER rsp | 817 | CFI_DEF_CFA_REGISTER rsp |
819 | CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET | 818 | CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET |
@@ -1530,12 +1529,20 @@ ENTRY(nmi) | |||
1530 | 1529 | ||
1531 | /* Use %rdx as out temp variable throughout */ | 1530 | /* Use %rdx as out temp variable throughout */ |
1532 | pushq_cfi %rdx | 1531 | pushq_cfi %rdx |
1532 | CFI_REL_OFFSET rdx, 0 | ||
1533 | |||
1534 | /* | ||
1535 | * If %cs was not the kernel segment, then the NMI triggered in user | ||
1536 | * space, which means it is definitely not nested. | ||
1537 | */ | ||
1538 | cmpl $__KERNEL_CS, 16(%rsp) | ||
1539 | jne first_nmi | ||
1533 | 1540 | ||
1534 | /* | 1541 | /* |
1535 | * Check the special variable on the stack to see if NMIs are | 1542 | * Check the special variable on the stack to see if NMIs are |
1536 | * executing. | 1543 | * executing. |
1537 | */ | 1544 | */ |
1538 | cmp $1, -8(%rsp) | 1545 | cmpl $1, -8(%rsp) |
1539 | je nested_nmi | 1546 | je nested_nmi |
1540 | 1547 | ||
1541 | /* | 1548 | /* |
@@ -1547,6 +1554,7 @@ ENTRY(nmi) | |||
1547 | */ | 1554 | */ |
1548 | lea 6*8(%rsp), %rdx | 1555 | lea 6*8(%rsp), %rdx |
1549 | test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi | 1556 | test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi |
1557 | CFI_REMEMBER_STATE | ||
1550 | 1558 | ||
1551 | nested_nmi: | 1559 | nested_nmi: |
1552 | /* | 1560 | /* |
@@ -1578,10 +1586,12 @@ nested_nmi: | |||
1578 | 1586 | ||
1579 | nested_nmi_out: | 1587 | nested_nmi_out: |
1580 | popq_cfi %rdx | 1588 | popq_cfi %rdx |
1589 | CFI_RESTORE rdx | ||
1581 | 1590 | ||
1582 | /* No need to check faults here */ | 1591 | /* No need to check faults here */ |
1583 | INTERRUPT_RETURN | 1592 | INTERRUPT_RETURN |
1584 | 1593 | ||
1594 | CFI_RESTORE_STATE | ||
1585 | first_nmi: | 1595 | first_nmi: |
1586 | /* | 1596 | /* |
1587 | * Because nested NMIs will use the pushed location that we | 1597 | * Because nested NMIs will use the pushed location that we |
@@ -1613,10 +1623,15 @@ first_nmi: | |||
1613 | * | pt_regs | | 1623 | * | pt_regs | |
1614 | * +-------------------------+ | 1624 | * +-------------------------+ |
1615 | * | 1625 | * |
1616 | * The saved RIP is used to fix up the copied RIP that a nested | 1626 | * The saved stack frame is used to fix up the copied stack frame |
1617 | * NMI may zero out. The original stack frame and the temp storage | 1627 | * that a nested NMI may change to make the interrupted NMI iret jump |
1628 | * to the repeat_nmi. The original stack frame and the temp storage | ||
1618 | * is also used by nested NMIs and can not be trusted on exit. | 1629 | * is also used by nested NMIs and can not be trusted on exit. |
1619 | */ | 1630 | */ |
1631 | /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ | ||
1632 | movq (%rsp), %rdx | ||
1633 | CFI_RESTORE rdx | ||
1634 | |||
1620 | /* Set the NMI executing variable on the stack. */ | 1635 | /* Set the NMI executing variable on the stack. */ |
1621 | pushq_cfi $1 | 1636 | pushq_cfi $1 |
1622 | 1637 | ||
@@ -1624,22 +1639,39 @@ first_nmi: | |||
1624 | .rept 5 | 1639 | .rept 5 |
1625 | pushq_cfi 6*8(%rsp) | 1640 | pushq_cfi 6*8(%rsp) |
1626 | .endr | 1641 | .endr |
1642 | CFI_DEF_CFA_OFFSET SS+8-RIP | ||
1643 | |||
1644 | /* Everything up to here is safe from nested NMIs */ | ||
1645 | |||
1646 | /* | ||
1647 | * If there was a nested NMI, the first NMI's iret will return | ||
1648 | * here. But NMIs are still enabled and we can take another | ||
1649 | * nested NMI. The nested NMI checks the interrupted RIP to see | ||
1650 | * if it is between repeat_nmi and end_repeat_nmi, and if so | ||
1651 | * it will just return, as we are about to repeat an NMI anyway. | ||
1652 | * This makes it safe to copy to the stack frame that a nested | ||
1653 | * NMI will update. | ||
1654 | */ | ||
1655 | repeat_nmi: | ||
1656 | /* | ||
1657 | * Update the stack variable to say we are still in NMI (the update | ||
1658 | * is benign for the non-repeat case, where 1 was pushed just above | ||
1659 | * to this very stack slot). | ||
1660 | */ | ||
1661 | movq $1, 5*8(%rsp) | ||
1627 | 1662 | ||
1628 | /* Make another copy, this one may be modified by nested NMIs */ | 1663 | /* Make another copy, this one may be modified by nested NMIs */ |
1629 | .rept 5 | 1664 | .rept 5 |
1630 | pushq_cfi 4*8(%rsp) | 1665 | pushq_cfi 4*8(%rsp) |
1631 | .endr | 1666 | .endr |
1632 | 1667 | CFI_DEF_CFA_OFFSET SS+8-RIP | |
1633 | /* Do not pop rdx, nested NMIs will corrupt it */ | 1668 | end_repeat_nmi: |
1634 | movq 11*8(%rsp), %rdx | ||
1635 | 1669 | ||
1636 | /* | 1670 | /* |
1637 | * Everything below this point can be preempted by a nested | 1671 | * Everything below this point can be preempted by a nested |
1638 | * NMI if the first NMI took an exception. Repeated NMIs | 1672 | * NMI if the first NMI took an exception and reset our iret stack |
1639 | * caused by an exception and nested NMI will start here, and | 1673 | * so that we repeat another NMI. |
1640 | * can still be preempted by another NMI. | ||
1641 | */ | 1674 | */ |
1642 | restart_nmi: | ||
1643 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | 1675 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
1644 | subq $ORIG_RAX-R15, %rsp | 1676 | subq $ORIG_RAX-R15, %rsp |
1645 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | 1677 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
@@ -1668,26 +1700,6 @@ nmi_restore: | |||
1668 | CFI_ENDPROC | 1700 | CFI_ENDPROC |
1669 | END(nmi) | 1701 | END(nmi) |
1670 | 1702 | ||
1671 | /* | ||
1672 | * If an NMI hit an iret because of an exception or breakpoint, | ||
1673 | * it can lose its NMI context, and a nested NMI may come in. | ||
1674 | * In that case, the nested NMI will change the preempted NMI's | ||
1675 | * stack to jump to here when it does the final iret. | ||
1676 | */ | ||
1677 | repeat_nmi: | ||
1678 | INTR_FRAME | ||
1679 | /* Update the stack variable to say we are still in NMI */ | ||
1680 | movq $1, 5*8(%rsp) | ||
1681 | |||
1682 | /* copy the saved stack back to copy stack */ | ||
1683 | .rept 5 | ||
1684 | pushq_cfi 4*8(%rsp) | ||
1685 | .endr | ||
1686 | |||
1687 | jmp restart_nmi | ||
1688 | CFI_ENDPROC | ||
1689 | end_repeat_nmi: | ||
1690 | |||
1691 | ENTRY(ignore_sysret) | 1703 | ENTRY(ignore_sysret) |
1692 | CFI_STARTPROC | 1704 | CFI_STARTPROC |
1693 | mov $-ENOSYS,%eax | 1705 | mov $-ENOSYS,%eax |
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 739d8598f789..7734bcbb5a3a 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <asm/uaccess.h> | 16 | #include <asm/uaccess.h> |
17 | #include <asm/ptrace.h> | 17 | #include <asm/ptrace.h> |
18 | #include <asm/i387.h> | 18 | #include <asm/i387.h> |
19 | #include <asm/fpu-internal.h> | ||
19 | #include <asm/user.h> | 20 | #include <asm/user.h> |
20 | 21 | ||
21 | #ifdef CONFIG_X86_64 | 22 | #ifdef CONFIG_X86_64 |
@@ -32,6 +33,86 @@ | |||
32 | # define user32_fxsr_struct user_fxsr_struct | 33 | # define user32_fxsr_struct user_fxsr_struct |
33 | #endif | 34 | #endif |
34 | 35 | ||
36 | /* | ||
37 | * Were we in an interrupt that interrupted kernel mode? | ||
38 | * | ||
39 | * We can do a kernel_fpu_begin/end() pair *ONLY* if that | ||
40 | * pair does nothing at all: the thread must not have fpu (so | ||
41 | * that we don't try to save the FPU state), and TS must | ||
42 | * be set (so that the clts/stts pair does nothing that is | ||
43 | * visible in the interrupted kernel thread). | ||
44 | */ | ||
45 | static inline bool interrupted_kernel_fpu_idle(void) | ||
46 | { | ||
47 | return !__thread_has_fpu(current) && | ||
48 | (read_cr0() & X86_CR0_TS); | ||
49 | } | ||
50 | |||
51 | /* | ||
52 | * Were we in user mode (or vm86 mode) when we were | ||
53 | * interrupted? | ||
54 | * | ||
55 | * Doing kernel_fpu_begin/end() is ok if we are running | ||
56 | * in an interrupt context from user mode - we'll just | ||
57 | * save the FPU state as required. | ||
58 | */ | ||
59 | static inline bool interrupted_user_mode(void) | ||
60 | { | ||
61 | struct pt_regs *regs = get_irq_regs(); | ||
62 | return regs && user_mode_vm(regs); | ||
63 | } | ||
64 | |||
65 | /* | ||
66 | * Can we use the FPU in kernel mode with the | ||
67 | * whole "kernel_fpu_begin/end()" sequence? | ||
68 | * | ||
69 | * It's always ok in process context (ie "not interrupt") | ||
70 | * but it is sometimes ok even from an irq. | ||
71 | */ | ||
72 | bool irq_fpu_usable(void) | ||
73 | { | ||
74 | return !in_interrupt() || | ||
75 | interrupted_user_mode() || | ||
76 | interrupted_kernel_fpu_idle(); | ||
77 | } | ||
78 | EXPORT_SYMBOL(irq_fpu_usable); | ||
79 | |||
80 | void kernel_fpu_begin(void) | ||
81 | { | ||
82 | struct task_struct *me = current; | ||
83 | |||
84 | WARN_ON_ONCE(!irq_fpu_usable()); | ||
85 | preempt_disable(); | ||
86 | if (__thread_has_fpu(me)) { | ||
87 | __save_init_fpu(me); | ||
88 | __thread_clear_has_fpu(me); | ||
89 | /* We do 'stts()' in kernel_fpu_end() */ | ||
90 | } else { | ||
91 | percpu_write(fpu_owner_task, NULL); | ||
92 | clts(); | ||
93 | } | ||
94 | } | ||
95 | EXPORT_SYMBOL(kernel_fpu_begin); | ||
96 | |||
97 | void kernel_fpu_end(void) | ||
98 | { | ||
99 | stts(); | ||
100 | preempt_enable(); | ||
101 | } | ||
102 | EXPORT_SYMBOL(kernel_fpu_end); | ||
103 | |||
104 | void unlazy_fpu(struct task_struct *tsk) | ||
105 | { | ||
106 | preempt_disable(); | ||
107 | if (__thread_has_fpu(tsk)) { | ||
108 | __save_init_fpu(tsk); | ||
109 | __thread_fpu_end(tsk); | ||
110 | } else | ||
111 | tsk->fpu_counter = 0; | ||
112 | preempt_enable(); | ||
113 | } | ||
114 | EXPORT_SYMBOL(unlazy_fpu); | ||
115 | |||
35 | #ifdef CONFIG_MATH_EMULATION | 116 | #ifdef CONFIG_MATH_EMULATION |
36 | # define HAVE_HWFP (boot_cpu_data.hard_math) | 117 | # define HAVE_HWFP (boot_cpu_data.hard_math) |
37 | #else | 118 | #else |
@@ -44,7 +125,7 @@ EXPORT_SYMBOL_GPL(xstate_size); | |||
44 | unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); | 125 | unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); |
45 | static struct i387_fxsave_struct fx_scratch __cpuinitdata; | 126 | static struct i387_fxsave_struct fx_scratch __cpuinitdata; |
46 | 127 | ||
47 | void __cpuinit mxcsr_feature_mask_init(void) | 128 | static void __cpuinit mxcsr_feature_mask_init(void) |
48 | { | 129 | { |
49 | unsigned long mask = 0; | 130 | unsigned long mask = 0; |
50 | 131 | ||
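The helpers moved into i387.c above give kernel code that does not own the FPU a safe window to use SSE/FPU instructions. A hedged usage sketch follows; the SIMD routine is an assumed placeholder, not a real kernel function:

#include <linux/string.h>
#include <asm/i387.h>

/* stand-in for an SSE-optimized copy; plain memcpy keeps the sketch buildable */
static void simd_copy(void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
}

static void copy_maybe_with_simd(void *dst, const void *src, size_t len)
{
	if (!irq_fpu_usable()) {
		memcpy(dst, src, len);	/* plain fallback when the FPU is off limits */
		return;
	}

	kernel_fpu_begin();
	simd_copy(dst, src, len);
	kernel_fpu_end();
}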
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 40fc86161d92..58b7f27cb3e9 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -100,13 +100,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) | |||
100 | irqctx->tinfo.task = curctx->tinfo.task; | 100 | irqctx->tinfo.task = curctx->tinfo.task; |
101 | irqctx->tinfo.previous_esp = current_stack_pointer; | 101 | irqctx->tinfo.previous_esp = current_stack_pointer; |
102 | 102 | ||
103 | /* | 103 | /* Copy the preempt_count so that the [soft]irq checks work. */ |
104 | * Copy the softirq bits in preempt_count so that the | 104 | irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count; |
105 | * softirq checks work in the hardirq context. | ||
106 | */ | ||
107 | irqctx->tinfo.preempt_count = | ||
108 | (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | | ||
109 | (curctx->tinfo.preempt_count & SOFTIRQ_MASK); | ||
110 | 105 | ||
111 | if (unlikely(overflow)) | 106 | if (unlikely(overflow)) |
112 | call_on_stack(print_stack_overflow, isp); | 107 | call_on_stack(print_stack_overflow, isp); |
@@ -196,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) | |||
196 | if (unlikely(!desc)) | 191 | if (unlikely(!desc)) |
197 | return false; | 192 | return false; |
198 | 193 | ||
199 | if (!execute_on_irq_stack(overflow, desc, irq)) { | 194 | if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { |
200 | if (unlikely(overflow)) | 195 | if (unlikely(overflow)) |
201 | print_stack_overflow(); | 196 | print_stack_overflow(); |
202 | desc->handle_irq(irq, desc); | 197 | desc->handle_irq(irq, desc); |
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 313fb5cddbce..43e2b1cff0a7 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -306,10 +306,10 @@ void __init native_init_IRQ(void) | |||
306 | * us. (some of these will be overridden and become | 306 | * us. (some of these will be overridden and become |
307 | * 'special' SMP interrupts) | 307 | * 'special' SMP interrupts) |
308 | */ | 308 | */ |
309 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { | 309 | i = FIRST_EXTERNAL_VECTOR; |
310 | for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { | ||
310 | /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ | 311 | /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ |
311 | if (!test_bit(i, used_vectors)) | 312 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); |
312 | set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); | ||
313 | } | 313 | } |
314 | 314 | ||
315 | if (!acpi_ioapic && !of_ioapic) | 315 | if (!acpi_ioapic && !of_ioapic) |
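The rewritten loop relies on the for_each_clear_bit_from() iterator, which starts at the current value of i and visits every zero bit up to the limit. A small hedged sketch of the same idiom in isolation (illustrative only):

#include <linux/bitops.h>

static unsigned int count_free_slots(const unsigned long *used,
				     unsigned int start, unsigned int nbits)
{
	unsigned int i = start, free = 0;

	/* i is advanced to each clear bit in [start, nbits) */
	for_each_clear_bit_from(i, used, nbits)
		free++;

	return free;
}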
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index faba5771acad..fdc37b3d0ce3 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -67,8 +67,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = | |||
67 | { "ss", 4, offsetof(struct pt_regs, ss) }, | 67 | { "ss", 4, offsetof(struct pt_regs, ss) }, |
68 | { "ds", 4, offsetof(struct pt_regs, ds) }, | 68 | { "ds", 4, offsetof(struct pt_regs, ds) }, |
69 | { "es", 4, offsetof(struct pt_regs, es) }, | 69 | { "es", 4, offsetof(struct pt_regs, es) }, |
70 | { "fs", 4, -1 }, | ||
71 | { "gs", 4, -1 }, | ||
72 | #else | 70 | #else |
73 | { "ax", 8, offsetof(struct pt_regs, ax) }, | 71 | { "ax", 8, offsetof(struct pt_regs, ax) }, |
74 | { "bx", 8, offsetof(struct pt_regs, bx) }, | 72 | { "bx", 8, offsetof(struct pt_regs, bx) }, |
@@ -90,7 +88,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = | |||
90 | { "flags", 4, offsetof(struct pt_regs, flags) }, | 88 | { "flags", 4, offsetof(struct pt_regs, flags) }, |
91 | { "cs", 4, offsetof(struct pt_regs, cs) }, | 89 | { "cs", 4, offsetof(struct pt_regs, cs) }, |
92 | { "ss", 4, offsetof(struct pt_regs, ss) }, | 90 | { "ss", 4, offsetof(struct pt_regs, ss) }, |
91 | { "ds", 4, -1 }, | ||
92 | { "es", 4, -1 }, | ||
93 | #endif | 93 | #endif |
94 | { "fs", 4, -1 }, | ||
95 | { "gs", 4, -1 }, | ||
94 | }; | 96 | }; |
95 | 97 | ||
96 | int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) | 98 | int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) |
diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h new file mode 100644 index 000000000000..3230b68ef29a --- /dev/null +++ b/arch/x86/kernel/kprobes-common.h | |||
@@ -0,0 +1,102 @@ | |||
1 | #ifndef __X86_KERNEL_KPROBES_COMMON_H | ||
2 | #define __X86_KERNEL_KPROBES_COMMON_H | ||
3 | |||
4 | /* Kprobes and Optprobes common header */ | ||
5 | |||
6 | #ifdef CONFIG_X86_64 | ||
7 | #define SAVE_REGS_STRING \ | ||
8 | /* Skip cs, ip, orig_ax. */ \ | ||
9 | " subq $24, %rsp\n" \ | ||
10 | " pushq %rdi\n" \ | ||
11 | " pushq %rsi\n" \ | ||
12 | " pushq %rdx\n" \ | ||
13 | " pushq %rcx\n" \ | ||
14 | " pushq %rax\n" \ | ||
15 | " pushq %r8\n" \ | ||
16 | " pushq %r9\n" \ | ||
17 | " pushq %r10\n" \ | ||
18 | " pushq %r11\n" \ | ||
19 | " pushq %rbx\n" \ | ||
20 | " pushq %rbp\n" \ | ||
21 | " pushq %r12\n" \ | ||
22 | " pushq %r13\n" \ | ||
23 | " pushq %r14\n" \ | ||
24 | " pushq %r15\n" | ||
25 | #define RESTORE_REGS_STRING \ | ||
26 | " popq %r15\n" \ | ||
27 | " popq %r14\n" \ | ||
28 | " popq %r13\n" \ | ||
29 | " popq %r12\n" \ | ||
30 | " popq %rbp\n" \ | ||
31 | " popq %rbx\n" \ | ||
32 | " popq %r11\n" \ | ||
33 | " popq %r10\n" \ | ||
34 | " popq %r9\n" \ | ||
35 | " popq %r8\n" \ | ||
36 | " popq %rax\n" \ | ||
37 | " popq %rcx\n" \ | ||
38 | " popq %rdx\n" \ | ||
39 | " popq %rsi\n" \ | ||
40 | " popq %rdi\n" \ | ||
41 | /* Skip orig_ax, ip, cs */ \ | ||
42 | " addq $24, %rsp\n" | ||
43 | #else | ||
44 | #define SAVE_REGS_STRING \ | ||
45 | /* Skip cs, ip, orig_ax and gs. */ \ | ||
46 | " subl $16, %esp\n" \ | ||
47 | " pushl %fs\n" \ | ||
48 | " pushl %es\n" \ | ||
49 | " pushl %ds\n" \ | ||
50 | " pushl %eax\n" \ | ||
51 | " pushl %ebp\n" \ | ||
52 | " pushl %edi\n" \ | ||
53 | " pushl %esi\n" \ | ||
54 | " pushl %edx\n" \ | ||
55 | " pushl %ecx\n" \ | ||
56 | " pushl %ebx\n" | ||
57 | #define RESTORE_REGS_STRING \ | ||
58 | " popl %ebx\n" \ | ||
59 | " popl %ecx\n" \ | ||
60 | " popl %edx\n" \ | ||
61 | " popl %esi\n" \ | ||
62 | " popl %edi\n" \ | ||
63 | " popl %ebp\n" \ | ||
64 | " popl %eax\n" \ | ||
65 | /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ | ||
66 | " addl $24, %esp\n" | ||
67 | #endif | ||
68 | |||
69 | /* Check whether the instruction can be boosted */ | ||
70 | extern int can_boost(kprobe_opcode_t *instruction); | ||
71 | /* Recover instruction if given address is probed */ | ||
72 | extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, | ||
73 | unsigned long addr); | ||
74 | /* | ||
75 | * Copy an instruction and adjust the displacement if the instruction | ||
76 | * uses the %rip-relative addressing mode. | ||
77 | */ | ||
78 | extern int __copy_instruction(u8 *dest, u8 *src); | ||
79 | |||
80 | /* Generate a relative-jump/call instruction */ | ||
81 | extern void synthesize_reljump(void *from, void *to); | ||
82 | extern void synthesize_relcall(void *from, void *to); | ||
83 | |||
84 | #ifdef CONFIG_OPTPROBES | ||
85 | extern int arch_init_optprobes(void); | ||
86 | extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); | ||
87 | extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); | ||
88 | #else /* !CONFIG_OPTPROBES */ | ||
89 | static inline int arch_init_optprobes(void) | ||
90 | { | ||
91 | return 0; | ||
92 | } | ||
93 | static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) | ||
94 | { | ||
95 | return 0; | ||
96 | } | ||
97 | static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) | ||
98 | { | ||
99 | return addr; | ||
100 | } | ||
101 | #endif | ||
102 | #endif | ||
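The !CONFIG_OPTPROBES stubs at the bottom of this header let the common kprobes code call into the optprobe layer unconditionally; when optprobes are compiled out the calls collapse to no-ops. A hedged sketch of a caller, mirroring how recover_probed_instruction() in kprobes.c uses it below (illustrative only):

#include <linux/kprobes.h>
#include "kprobes-common.h"

static unsigned long example_recover(kprobe_opcode_t *buf, unsigned long addr)
{
	unsigned long fixed = __recover_optprobed_insn(buf, addr);

	/* with CONFIG_OPTPROBES=n the stub hands addr straight back */
	if (fixed != addr)
		return fixed;	/* an optprobe owned this address */

	return addr;
}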
diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c new file mode 100644 index 000000000000..c5e410eed403 --- /dev/null +++ b/arch/x86/kernel/kprobes-opt.c | |||
@@ -0,0 +1,512 @@ | |||
1 | /* | ||
2 | * Kernel Probes Jump Optimization (Optprobes) | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2002, 2004 | ||
19 | * Copyright (C) Hitachi Ltd., 2012 | ||
20 | */ | ||
21 | #include <linux/kprobes.h> | ||
22 | #include <linux/ptrace.h> | ||
23 | #include <linux/string.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/hardirq.h> | ||
26 | #include <linux/preempt.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/kdebug.h> | ||
29 | #include <linux/kallsyms.h> | ||
30 | #include <linux/ftrace.h> | ||
31 | |||
32 | #include <asm/cacheflush.h> | ||
33 | #include <asm/desc.h> | ||
34 | #include <asm/pgtable.h> | ||
35 | #include <asm/uaccess.h> | ||
36 | #include <asm/alternative.h> | ||
37 | #include <asm/insn.h> | ||
38 | #include <asm/debugreg.h> | ||
39 | |||
40 | #include "kprobes-common.h" | ||
41 | |||
42 | unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) | ||
43 | { | ||
44 | struct optimized_kprobe *op; | ||
45 | struct kprobe *kp; | ||
46 | long offs; | ||
47 | int i; | ||
48 | |||
49 | for (i = 0; i < RELATIVEJUMP_SIZE; i++) { | ||
50 | kp = get_kprobe((void *)addr - i); | ||
51 | /* This function only handles jump-optimized kprobe */ | ||
52 | if (kp && kprobe_optimized(kp)) { | ||
53 | op = container_of(kp, struct optimized_kprobe, kp); | ||
54 | /* If op->list is not empty, op is still being optimized */ | ||
55 | if (list_empty(&op->list)) | ||
56 | goto found; | ||
57 | } | ||
58 | } | ||
59 | |||
60 | return addr; | ||
61 | found: | ||
62 | /* | ||
63 | * If the kprobe has been optimized, the original bytes may have been | ||
64 | * overwritten by the jump destination address. In this case, the original | ||
65 | * bytes must be recovered from the op->optinsn.copied_insn buffer. | ||
66 | */ | ||
67 | memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); | ||
68 | if (addr == (unsigned long)kp->addr) { | ||
69 | buf[0] = kp->opcode; | ||
70 | memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); | ||
71 | } else { | ||
72 | offs = addr - (unsigned long)kp->addr - 1; | ||
73 | memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); | ||
74 | } | ||
75 | |||
76 | return (unsigned long)buf; | ||
77 | } | ||
78 | |||
79 | /* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ | ||
80 | static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) | ||
81 | { | ||
82 | #ifdef CONFIG_X86_64 | ||
83 | *addr++ = 0x48; | ||
84 | *addr++ = 0xbf; | ||
85 | #else | ||
86 | *addr++ = 0xb8; | ||
87 | #endif | ||
88 | *(unsigned long *)addr = val; | ||
89 | } | ||
90 | |||
91 | static void __used __kprobes kprobes_optinsn_template_holder(void) | ||
92 | { | ||
93 | asm volatile ( | ||
94 | ".global optprobe_template_entry\n" | ||
95 | "optprobe_template_entry:\n" | ||
96 | #ifdef CONFIG_X86_64 | ||
97 | /* We don't bother saving the ss register */ | ||
98 | " pushq %rsp\n" | ||
99 | " pushfq\n" | ||
100 | SAVE_REGS_STRING | ||
101 | " movq %rsp, %rsi\n" | ||
102 | ".global optprobe_template_val\n" | ||
103 | "optprobe_template_val:\n" | ||
104 | ASM_NOP5 | ||
105 | ASM_NOP5 | ||
106 | ".global optprobe_template_call\n" | ||
107 | "optprobe_template_call:\n" | ||
108 | ASM_NOP5 | ||
109 | /* Move flags to rsp */ | ||
110 | " movq 144(%rsp), %rdx\n" | ||
111 | " movq %rdx, 152(%rsp)\n" | ||
112 | RESTORE_REGS_STRING | ||
113 | /* Skip flags entry */ | ||
114 | " addq $8, %rsp\n" | ||
115 | " popfq\n" | ||
116 | #else /* CONFIG_X86_32 */ | ||
117 | " pushf\n" | ||
118 | SAVE_REGS_STRING | ||
119 | " movl %esp, %edx\n" | ||
120 | ".global optprobe_template_val\n" | ||
121 | "optprobe_template_val:\n" | ||
122 | ASM_NOP5 | ||
123 | ".global optprobe_template_call\n" | ||
124 | "optprobe_template_call:\n" | ||
125 | ASM_NOP5 | ||
126 | RESTORE_REGS_STRING | ||
127 | " addl $4, %esp\n" /* skip cs */ | ||
128 | " popf\n" | ||
129 | #endif | ||
130 | ".global optprobe_template_end\n" | ||
131 | "optprobe_template_end:\n"); | ||
132 | } | ||
133 | |||
134 | #define TMPL_MOVE_IDX \ | ||
135 | ((long)&optprobe_template_val - (long)&optprobe_template_entry) | ||
136 | #define TMPL_CALL_IDX \ | ||
137 | ((long)&optprobe_template_call - (long)&optprobe_template_entry) | ||
138 | #define TMPL_END_IDX \ | ||
139 | ((long)&optprobe_template_end - (long)&optprobe_template_entry) | ||
140 | |||
141 | #define INT3_SIZE sizeof(kprobe_opcode_t) | ||
142 | |||
143 | /* Optimized kprobe callback function: called from optinsn */ | ||
144 | static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) | ||
145 | { | ||
146 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
147 | unsigned long flags; | ||
148 | |||
149 | /* This is possible if op is under delayed unoptimizing */ | ||
150 | if (kprobe_disabled(&op->kp)) | ||
151 | return; | ||
152 | |||
153 | local_irq_save(flags); | ||
154 | if (kprobe_running()) { | ||
155 | kprobes_inc_nmissed_count(&op->kp); | ||
156 | } else { | ||
157 | /* Save skipped registers */ | ||
158 | #ifdef CONFIG_X86_64 | ||
159 | regs->cs = __KERNEL_CS; | ||
160 | #else | ||
161 | regs->cs = __KERNEL_CS | get_kernel_rpl(); | ||
162 | regs->gs = 0; | ||
163 | #endif | ||
164 | regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; | ||
165 | regs->orig_ax = ~0UL; | ||
166 | |||
167 | __this_cpu_write(current_kprobe, &op->kp); | ||
168 | kcb->kprobe_status = KPROBE_HIT_ACTIVE; | ||
169 | opt_pre_handler(&op->kp, regs); | ||
170 | __this_cpu_write(current_kprobe, NULL); | ||
171 | } | ||
172 | local_irq_restore(flags); | ||
173 | } | ||
174 | |||
175 | static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) | ||
176 | { | ||
177 | int len = 0, ret; | ||
178 | |||
179 | while (len < RELATIVEJUMP_SIZE) { | ||
180 | ret = __copy_instruction(dest + len, src + len); | ||
181 | if (!ret || !can_boost(dest + len)) | ||
182 | return -EINVAL; | ||
183 | len += ret; | ||
184 | } | ||
185 | /* Check whether the address range is reserved */ | ||
186 | if (ftrace_text_reserved(src, src + len - 1) || | ||
187 | alternatives_text_reserved(src, src + len - 1) || | ||
188 | jump_label_text_reserved(src, src + len - 1)) | ||
189 | return -EBUSY; | ||
190 | |||
191 | return len; | ||
192 | } | ||
193 | |||
194 | /* Check whether insn is indirect jump */ | ||
195 | static int __kprobes insn_is_indirect_jump(struct insn *insn) | ||
196 | { | ||
197 | return ((insn->opcode.bytes[0] == 0xff && | ||
198 | (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ | ||
199 | insn->opcode.bytes[0] == 0xea); /* Segment based jump */ | ||
200 | } | ||
201 | |||
202 | /* Check whether insn jumps into specified address range */ | ||
203 | static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) | ||
204 | { | ||
205 | unsigned long target = 0; | ||
206 | |||
207 | switch (insn->opcode.bytes[0]) { | ||
208 | case 0xe0: /* loopne */ | ||
209 | case 0xe1: /* loope */ | ||
210 | case 0xe2: /* loop */ | ||
211 | case 0xe3: /* jcxz */ | ||
212 | case 0xe9: /* near relative jump */ | ||
213 | case 0xeb: /* short relative jump */ | ||
214 | break; | ||
215 | case 0x0f: | ||
216 | if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ | ||
217 | break; | ||
218 | return 0; | ||
219 | default: | ||
220 | if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ | ||
221 | break; | ||
222 | return 0; | ||
223 | } | ||
224 | target = (unsigned long)insn->next_byte + insn->immediate.value; | ||
225 | |||
226 | return (start <= target && target <= start + len); | ||
227 | } | ||
228 | |||
229 | /* Decode the whole function to ensure no instruction jumps into the target */ | ||
230 | static int __kprobes can_optimize(unsigned long paddr) | ||
231 | { | ||
232 | unsigned long addr, size = 0, offset = 0; | ||
233 | struct insn insn; | ||
234 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | ||
235 | |||
236 | /* Lookup symbol including addr */ | ||
237 | if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) | ||
238 | return 0; | ||
239 | |||
240 | /* | ||
241 | * Do not optimize in the entry code due to the unstable | ||
242 | * stack handling. | ||
243 | */ | ||
244 | if ((paddr >= (unsigned long)__entry_text_start) && | ||
245 | (paddr < (unsigned long)__entry_text_end)) | ||
246 | return 0; | ||
247 | |||
248 | /* Check there is enough space for a relative jump. */ | ||
249 | if (size - offset < RELATIVEJUMP_SIZE) | ||
250 | return 0; | ||
251 | |||
252 | /* Decode instructions */ | ||
253 | addr = paddr - offset; | ||
254 | while (addr < paddr - offset + size) { /* Decode until function end */ | ||
255 | if (search_exception_tables(addr)) | ||
256 | /* | ||
257 | * Since some fixup code may jump into this function, | ||
258 | * we can't optimize a kprobe in this function. | ||
259 | */ | ||
260 | return 0; | ||
261 | kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); | ||
262 | insn_get_length(&insn); | ||
263 | /* Another subsystem puts a breakpoint */ | ||
264 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) | ||
265 | return 0; | ||
266 | /* Recover address */ | ||
267 | insn.kaddr = (void *)addr; | ||
268 | insn.next_byte = (void *)(addr + insn.length); | ||
269 | /* Check that no instruction jumps into the target */ | ||
270 | if (insn_is_indirect_jump(&insn) || | ||
271 | insn_jump_into_range(&insn, paddr + INT3_SIZE, | ||
272 | RELATIVE_ADDR_SIZE)) | ||
273 | return 0; | ||
274 | addr += insn.length; | ||
275 | } | ||
276 | |||
277 | return 1; | ||
278 | } | ||
279 | |||
280 | /* Check optimized_kprobe can actually be optimized. */ | ||
281 | int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) | ||
282 | { | ||
283 | int i; | ||
284 | struct kprobe *p; | ||
285 | |||
286 | for (i = 1; i < op->optinsn.size; i++) { | ||
287 | p = get_kprobe(op->kp.addr + i); | ||
288 | if (p && !kprobe_disabled(p)) | ||
289 | return -EEXIST; | ||
290 | } | ||
291 | |||
292 | return 0; | ||
293 | } | ||
294 | |||
295 | /* Check the addr is within the optimized instructions. */ | ||
296 | int __kprobes | ||
297 | arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr) | ||
298 | { | ||
299 | return ((unsigned long)op->kp.addr <= addr && | ||
300 | (unsigned long)op->kp.addr + op->optinsn.size > addr); | ||
301 | } | ||
302 | |||
303 | /* Free optimized instruction slot */ | ||
304 | static __kprobes | ||
305 | void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) | ||
306 | { | ||
307 | if (op->optinsn.insn) { | ||
308 | free_optinsn_slot(op->optinsn.insn, dirty); | ||
309 | op->optinsn.insn = NULL; | ||
310 | op->optinsn.size = 0; | ||
311 | } | ||
312 | } | ||
313 | |||
314 | void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) | ||
315 | { | ||
316 | __arch_remove_optimized_kprobe(op, 1); | ||
317 | } | ||
318 | |||
319 | /* | ||
320 | * Copy the target instructions that will be replaced by the jump. | ||
321 | * Target instructions MUST be relocatable (checked inside). | ||
322 | * This is called when a new aggr(opt)probe is allocated or reused. | ||
323 | */ | ||
324 | int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) | ||
325 | { | ||
326 | u8 *buf; | ||
327 | int ret; | ||
328 | long rel; | ||
329 | |||
330 | if (!can_optimize((unsigned long)op->kp.addr)) | ||
331 | return -EILSEQ; | ||
332 | |||
333 | op->optinsn.insn = get_optinsn_slot(); | ||
334 | if (!op->optinsn.insn) | ||
335 | return -ENOMEM; | ||
336 | |||
337 | /* | ||
338 | * Verify that the address gap is within the 2GB range reachable | ||
339 | * by a relative jump. | ||
340 | */ | ||
341 | rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; | ||
342 | if (abs(rel) > 0x7fffffff) | ||
343 | return -ERANGE; | ||
344 | |||
345 | buf = (u8 *)op->optinsn.insn; | ||
346 | |||
347 | /* Copy instructions into the out-of-line buffer */ | ||
348 | ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); | ||
349 | if (ret < 0) { | ||
350 | __arch_remove_optimized_kprobe(op, 0); | ||
351 | return ret; | ||
352 | } | ||
353 | op->optinsn.size = ret; | ||
354 | |||
355 | /* Copy arch-dep-instance from template */ | ||
356 | memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); | ||
357 | |||
358 | /* Set probe information */ | ||
359 | synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); | ||
360 | |||
361 | /* Set probe function call */ | ||
362 | synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); | ||
363 | |||
364 | /* Set returning jmp instruction at the tail of out-of-line buffer */ | ||
365 | synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, | ||
366 | (u8 *)op->kp.addr + op->optinsn.size); | ||
367 | |||
368 | flush_icache_range((unsigned long) buf, | ||
369 | (unsigned long) buf + TMPL_END_IDX + | ||
370 | op->optinsn.size + RELATIVEJUMP_SIZE); | ||
371 | return 0; | ||
372 | } | ||
373 | |||
374 | #define MAX_OPTIMIZE_PROBES 256 | ||
375 | static struct text_poke_param *jump_poke_params; | ||
376 | static struct jump_poke_buffer { | ||
377 | u8 buf[RELATIVEJUMP_SIZE]; | ||
378 | } *jump_poke_bufs; | ||
379 | |||
380 | static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, | ||
381 | u8 *insn_buf, | ||
382 | struct optimized_kprobe *op) | ||
383 | { | ||
384 | s32 rel = (s32)((long)op->optinsn.insn - | ||
385 | ((long)op->kp.addr + RELATIVEJUMP_SIZE)); | ||
386 | |||
387 | /* Backup instructions which will be replaced by jump address */ | ||
388 | memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, | ||
389 | RELATIVE_ADDR_SIZE); | ||
390 | |||
391 | insn_buf[0] = RELATIVEJUMP_OPCODE; | ||
392 | *(s32 *)(&insn_buf[1]) = rel; | ||
393 | |||
394 | tprm->addr = op->kp.addr; | ||
395 | tprm->opcode = insn_buf; | ||
396 | tprm->len = RELATIVEJUMP_SIZE; | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * Replace breakpoints (int3) with relative jumps. | ||
401 | * Caller must hold kprobe_mutex and text_mutex. | ||
402 | */ | ||
403 | void __kprobes arch_optimize_kprobes(struct list_head *oplist) | ||
404 | { | ||
405 | struct optimized_kprobe *op, *tmp; | ||
406 | int c = 0; | ||
407 | |||
408 | list_for_each_entry_safe(op, tmp, oplist, list) { | ||
409 | WARN_ON(kprobe_disabled(&op->kp)); | ||
410 | /* Setup param */ | ||
411 | setup_optimize_kprobe(&jump_poke_params[c], | ||
412 | jump_poke_bufs[c].buf, op); | ||
413 | list_del_init(&op->list); | ||
414 | if (++c >= MAX_OPTIMIZE_PROBES) | ||
415 | break; | ||
416 | } | ||
417 | |||
418 | /* | ||
419 | * text_poke_smp doesn't support NMI/MCE code modifying. | ||
420 | * However, since kprobes itself also doesn't support NMI/MCE | ||
421 | * code probing, it's not a problem. | ||
422 | */ | ||
423 | text_poke_smp_batch(jump_poke_params, c); | ||
424 | } | ||
425 | |||
426 | static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, | ||
427 | u8 *insn_buf, | ||
428 | struct optimized_kprobe *op) | ||
429 | { | ||
430 | /* Set int3 to first byte for kprobes */ | ||
431 | insn_buf[0] = BREAKPOINT_INSTRUCTION; | ||
432 | memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); | ||
433 | |||
434 | tprm->addr = op->kp.addr; | ||
435 | tprm->opcode = insn_buf; | ||
436 | tprm->len = RELATIVEJUMP_SIZE; | ||
437 | } | ||
438 | |||
439 | /* | ||
440 | * Recover original instructions and breakpoints from relative jumps. | ||
441 | * Caller must hold kprobe_mutex. | ||
442 | */ | ||
443 | extern void arch_unoptimize_kprobes(struct list_head *oplist, | ||
444 | struct list_head *done_list) | ||
445 | { | ||
446 | struct optimized_kprobe *op, *tmp; | ||
447 | int c = 0; | ||
448 | |||
449 | list_for_each_entry_safe(op, tmp, oplist, list) { | ||
450 | /* Setup param */ | ||
451 | setup_unoptimize_kprobe(&jump_poke_params[c], | ||
452 | jump_poke_bufs[c].buf, op); | ||
453 | list_move(&op->list, done_list); | ||
454 | if (++c >= MAX_OPTIMIZE_PROBES) | ||
455 | break; | ||
456 | } | ||
457 | |||
458 | /* | ||
459 | * text_poke_smp doesn't support NMI/MCE code modifying. | ||
460 | * However, since kprobes itself also doesn't support NMI/MCE | ||
461 | * code probing, it's not a problem. | ||
462 | */ | ||
463 | text_poke_smp_batch(jump_poke_params, c); | ||
464 | } | ||
465 | |||
466 | /* Replace a relative jump with a breakpoint (int3). */ | ||
467 | void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) | ||
468 | { | ||
469 | u8 buf[RELATIVEJUMP_SIZE]; | ||
470 | |||
471 | /* Set int3 to first byte for kprobes */ | ||
472 | buf[0] = BREAKPOINT_INSTRUCTION; | ||
473 | memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); | ||
474 | text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); | ||
475 | } | ||
476 | |||
477 | int __kprobes | ||
478 | setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) | ||
479 | { | ||
480 | struct optimized_kprobe *op; | ||
481 | |||
482 | if (p->flags & KPROBE_FLAG_OPTIMIZED) { | ||
483 | /* This kprobe is really able to run optimized path. */ | ||
484 | op = container_of(p, struct optimized_kprobe, kp); | ||
485 | /* Detour through copied instructions */ | ||
486 | regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; | ||
487 | if (!reenter) | ||
488 | reset_current_kprobe(); | ||
489 | preempt_enable_no_resched(); | ||
490 | return 1; | ||
491 | } | ||
492 | return 0; | ||
493 | } | ||
494 | |||
495 | int __kprobes arch_init_optprobes(void) | ||
496 | { | ||
497 | /* Allocate code buffer and parameter array */ | ||
498 | jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * | ||
499 | MAX_OPTIMIZE_PROBES, GFP_KERNEL); | ||
500 | if (!jump_poke_bufs) | ||
501 | return -ENOMEM; | ||
502 | |||
503 | jump_poke_params = kmalloc(sizeof(struct text_poke_param) * | ||
504 | MAX_OPTIMIZE_PROBES, GFP_KERNEL); | ||
505 | if (!jump_poke_params) { | ||
506 | kfree(jump_poke_bufs); | ||
507 | jump_poke_bufs = NULL; | ||
508 | return -ENOMEM; | ||
509 | } | ||
510 | |||
511 | return 0; | ||
512 | } | ||
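Once this file is built in, an ordinary kprobe placed at a suitable function entry can be transparently converted by the code above into a jump-based optprobe whenever can_optimize() is satisfied. A hedged sketch of such a probe as a small module (the probed symbol is an assumption chosen for illustration):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %p, ip=%lx\n", p->addr, regs->ip);
	return 0;
}

static struct kprobe example_kp = {
	.symbol_name	= "do_fork",	/* assumed probe target */
	.pre_handler	= example_pre,
};

static int __init example_init(void)
{
	return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");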
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7da647d8b64c..e213fc8408d2 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c | |||
@@ -30,16 +30,15 @@ | |||
30 | * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi | 30 | * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi |
31 | * <prasanna@in.ibm.com> added function-return probes. | 31 | * <prasanna@in.ibm.com> added function-return probes. |
32 | * 2005-May Rusty Lynch <rusty.lynch@intel.com> | 32 | * 2005-May Rusty Lynch <rusty.lynch@intel.com> |
33 | * Added function return probes functionality | 33 | * Added function return probes functionality |
34 | * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added | 34 | * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added |
35 | * kprobe-booster and kretprobe-booster for i386. | 35 | * kprobe-booster and kretprobe-booster for i386. |
36 | * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster | 36 | * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster |
37 | * and kretprobe-booster for x86-64 | 37 | * and kretprobe-booster for x86-64 |
38 | * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven | 38 | * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven |
39 | * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> | 39 | * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> |
40 | * unified x86 kprobes code. | 40 | * unified x86 kprobes code. |
41 | */ | 41 | */ |
42 | |||
43 | #include <linux/kprobes.h> | 42 | #include <linux/kprobes.h> |
44 | #include <linux/ptrace.h> | 43 | #include <linux/ptrace.h> |
45 | #include <linux/string.h> | 44 | #include <linux/string.h> |
@@ -59,6 +58,8 @@ | |||
59 | #include <asm/insn.h> | 58 | #include <asm/insn.h> |
60 | #include <asm/debugreg.h> | 59 | #include <asm/debugreg.h> |
61 | 60 | ||
61 | #include "kprobes-common.h" | ||
62 | |||
62 | void jprobe_return_end(void); | 63 | void jprobe_return_end(void); |
63 | 64 | ||
64 | DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; | 65 | DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; |
@@ -108,6 +109,7 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { | |||
108 | doesn't switch kernel stack.*/ | 109 | doesn't switch kernel stack.*/ |
109 | {NULL, NULL} /* Terminator */ | 110 | {NULL, NULL} /* Terminator */ |
110 | }; | 111 | }; |
112 | |||
111 | const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); | 113 | const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); |
112 | 114 | ||
113 | static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) | 115 | static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) |
@@ -123,11 +125,17 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) | |||
123 | } | 125 | } |
124 | 126 | ||
125 | /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ | 127 | /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ |
126 | static void __kprobes synthesize_reljump(void *from, void *to) | 128 | void __kprobes synthesize_reljump(void *from, void *to) |
127 | { | 129 | { |
128 | __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); | 130 | __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); |
129 | } | 131 | } |
130 | 132 | ||
133 | /* Insert a call instruction at address 'from', which calls address 'to'.*/ | ||
134 | void __kprobes synthesize_relcall(void *from, void *to) | ||
135 | { | ||
136 | __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); | ||
137 | } | ||
138 | |||
131 | /* | 139 | /* |
132 | * Skip the prefixes of the instruction. | 140 | * Skip the prefixes of the instruction. |
133 | */ | 141 | */ |
@@ -151,7 +159,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) | |||
151 | * Returns non-zero if opcode is boostable. | 159 | * Returns non-zero if opcode is boostable. |
152 | * RIP relative instructions are adjusted at copying time in 64 bits mode | 160 | * RIP relative instructions are adjusted at copying time in 64 bits mode |
153 | */ | 161 | */ |
154 | static int __kprobes can_boost(kprobe_opcode_t *opcodes) | 162 | int __kprobes can_boost(kprobe_opcode_t *opcodes) |
155 | { | 163 | { |
156 | kprobe_opcode_t opcode; | 164 | kprobe_opcode_t opcode; |
157 | kprobe_opcode_t *orig_opcodes = opcodes; | 165 | kprobe_opcode_t *orig_opcodes = opcodes; |
@@ -207,13 +215,15 @@ retry: | |||
207 | } | 215 | } |
208 | } | 216 | } |
209 | 217 | ||
210 | /* Recover the probed instruction at addr for further analysis. */ | 218 | static unsigned long |
211 | static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) | 219 | __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) |
212 | { | 220 | { |
213 | struct kprobe *kp; | 221 | struct kprobe *kp; |
222 | |||
214 | kp = get_kprobe((void *)addr); | 223 | kp = get_kprobe((void *)addr); |
224 | /* There is no probe, return original address */ | ||
215 | if (!kp) | 225 | if (!kp) |
216 | return -EINVAL; | 226 | return addr; |
217 | 227 | ||
218 | /* | 228 | /* |
219 | * Basically, kp->ainsn.insn has an original instruction. | 229 | * Basically, kp->ainsn.insn has an original instruction. |
@@ -230,14 +240,29 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) | |||
230 | */ | 240 | */ |
231 | memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); | 241 | memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); |
232 | buf[0] = kp->opcode; | 242 | buf[0] = kp->opcode; |
233 | return 0; | 243 | return (unsigned long)buf; |
244 | } | ||
245 | |||
246 | /* | ||
247 | * Recover the probed instruction at addr for further analysis. | ||
248 | * Caller must lock kprobes by kprobe_mutex, or disable preemption | ||
249 | * for preventing to release referencing kprobes. | ||
250 | */ | ||
251 | unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) | ||
252 | { | ||
253 | unsigned long __addr; | ||
254 | |||
255 | __addr = __recover_optprobed_insn(buf, addr); | ||
256 | if (__addr != addr) | ||
257 | return __addr; | ||
258 | |||
259 | return __recover_probed_insn(buf, addr); | ||
234 | } | 260 | } |
235 | 261 | ||
236 | /* Check if paddr is at an instruction boundary */ | 262 | /* Check if paddr is at an instruction boundary */ |
237 | static int __kprobes can_probe(unsigned long paddr) | 263 | static int __kprobes can_probe(unsigned long paddr) |
238 | { | 264 | { |
239 | int ret; | 265 | unsigned long addr, __addr, offset = 0; |
240 | unsigned long addr, offset = 0; | ||
241 | struct insn insn; | 266 | struct insn insn; |
242 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | 267 | kprobe_opcode_t buf[MAX_INSN_SIZE]; |
243 | 268 | ||
@@ -247,26 +272,24 @@ static int __kprobes can_probe(unsigned long paddr) | |||
247 | /* Decode instructions */ | 272 | /* Decode instructions */ |
248 | addr = paddr - offset; | 273 | addr = paddr - offset; |
249 | while (addr < paddr) { | 274 | while (addr < paddr) { |
250 | kernel_insn_init(&insn, (void *)addr); | ||
251 | insn_get_opcode(&insn); | ||
252 | |||
253 | /* | 275 | /* |
254 | * Check if the instruction has been modified by another | 276 | * Check if the instruction has been modified by another |
255 | * kprobe, in which case we replace the breakpoint by the | 277 | * kprobe, in which case we replace the breakpoint by the |
256 | * original instruction in our buffer. | 278 | * original instruction in our buffer. |
279 | * Also, jump optimization will change the breakpoint to a | ||
280 | * relative jump. Since a relative jump is itself a normally | ||
281 | * used instruction, we simply pass it through if there is no kprobe. | ||
257 | */ | 282 | */ |
258 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { | 283 | __addr = recover_probed_instruction(buf, addr); |
259 | ret = recover_probed_instruction(buf, addr); | 284 | kernel_insn_init(&insn, (void *)__addr); |
260 | if (ret) | ||
261 | /* | ||
262 | * Another debugging subsystem might insert | ||
263 | * this breakpoint. In that case, we can't | ||
264 | * recover it. | ||
265 | */ | ||
266 | return 0; | ||
267 | kernel_insn_init(&insn, buf); | ||
268 | } | ||
269 | insn_get_length(&insn); | 285 | insn_get_length(&insn); |
286 | |||
287 | /* | ||
288 | * Another debugging subsystem might insert this breakpoint. | ||
289 | * In that case, we can't recover it. | ||
290 | */ | ||
291 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) | ||
292 | return 0; | ||
270 | addr += insn.length; | 293 | addr += insn.length; |
271 | } | 294 | } |
272 | 295 | ||
@@ -299,24 +322,16 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) | |||
299 | * If not, return null. | 322 | * If not, return null. |
300 | * Only applicable to 64-bit x86. | 323 | * Only applicable to 64-bit x86. |
301 | */ | 324 | */ |
302 | static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) | 325 | int __kprobes __copy_instruction(u8 *dest, u8 *src) |
303 | { | 326 | { |
304 | struct insn insn; | 327 | struct insn insn; |
305 | int ret; | ||
306 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | 328 | kprobe_opcode_t buf[MAX_INSN_SIZE]; |
307 | 329 | ||
308 | kernel_insn_init(&insn, src); | 330 | kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); |
309 | if (recover) { | ||
310 | insn_get_opcode(&insn); | ||
311 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { | ||
312 | ret = recover_probed_instruction(buf, | ||
313 | (unsigned long)src); | ||
314 | if (ret) | ||
315 | return 0; | ||
316 | kernel_insn_init(&insn, buf); | ||
317 | } | ||
318 | } | ||
319 | insn_get_length(&insn); | 331 | insn_get_length(&insn); |
332 | /* Another subsystem has put a breakpoint here; we failed to recover it */ | ||
333 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) | ||
334 | return 0; | ||
320 | memcpy(dest, insn.kaddr, insn.length); | 335 | memcpy(dest, insn.kaddr, insn.length); |
321 | 336 | ||
322 | #ifdef CONFIG_X86_64 | 337 | #ifdef CONFIG_X86_64 |
@@ -337,8 +352,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) | |||
337 | * extension of the original signed 32-bit displacement would | 352 | * extension of the original signed 32-bit displacement would |
338 | * have given. | 353 | * have given. |
339 | */ | 354 | */ |
340 | newdisp = (u8 *) src + (s64) insn.displacement.value - | 355 | newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; |
341 | (u8 *) dest; | ||
342 | BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ | 356 | BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ |
343 | disp = (u8 *) dest + insn_offset_displacement(&insn); | 357 | disp = (u8 *) dest + insn_offset_displacement(&insn); |
344 | *(s32 *) disp = (s32) newdisp; | 358 | *(s32 *) disp = (s32) newdisp; |
@@ -349,18 +363,20 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) | |||
349 | 363 | ||
350 | static void __kprobes arch_copy_kprobe(struct kprobe *p) | 364 | static void __kprobes arch_copy_kprobe(struct kprobe *p) |
351 | { | 365 | { |
366 | /* Copy an instruction with recovering if other optprobe modifies it.*/ | ||
367 | __copy_instruction(p->ainsn.insn, p->addr); | ||
368 | |||
352 | /* | 369 | /* |
353 | * Copy an instruction without recovering int3, because it will be | 370 | * __copy_instruction can modify the displacement of the instruction, |
354 | * put by another subsystem. | 371 | * but it doesn't affect boostable check. |
355 | */ | 372 | */ |
356 | __copy_instruction(p->ainsn.insn, p->addr, 0); | 373 | if (can_boost(p->ainsn.insn)) |
357 | |||
358 | if (can_boost(p->addr)) | ||
359 | p->ainsn.boostable = 0; | 374 | p->ainsn.boostable = 0; |
360 | else | 375 | else |
361 | p->ainsn.boostable = -1; | 376 | p->ainsn.boostable = -1; |
362 | 377 | ||
363 | p->opcode = *p->addr; | 378 | /* Also, displacement change doesn't affect the first byte */ |
379 | p->opcode = p->ainsn.insn[0]; | ||
364 | } | 380 | } |
365 | 381 | ||
366 | int __kprobes arch_prepare_kprobe(struct kprobe *p) | 382 | int __kprobes arch_prepare_kprobe(struct kprobe *p) |
@@ -442,8 +458,8 @@ static void __kprobes restore_btf(void) | |||
442 | } | 458 | } |
443 | } | 459 | } |
444 | 460 | ||
445 | void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, | 461 | void __kprobes |
446 | struct pt_regs *regs) | 462 | arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) |
447 | { | 463 | { |
448 | unsigned long *sara = stack_addr(regs); | 464 | unsigned long *sara = stack_addr(regs); |
449 | 465 | ||
@@ -453,16 +469,8 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, | |||
453 | *sara = (unsigned long) &kretprobe_trampoline; | 469 | *sara = (unsigned long) &kretprobe_trampoline; |
454 | } | 470 | } |
455 | 471 | ||
456 | #ifdef CONFIG_OPTPROBES | 472 | static void __kprobes |
457 | static int __kprobes setup_detour_execution(struct kprobe *p, | 473 | setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) |
458 | struct pt_regs *regs, | ||
459 | int reenter); | ||
460 | #else | ||
461 | #define setup_detour_execution(p, regs, reenter) (0) | ||
462 | #endif | ||
463 | |||
464 | static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, | ||
465 | struct kprobe_ctlblk *kcb, int reenter) | ||
466 | { | 474 | { |
467 | if (setup_detour_execution(p, regs, reenter)) | 475 | if (setup_detour_execution(p, regs, reenter)) |
468 | return; | 476 | return; |
@@ -504,8 +512,8 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, | |||
504 | * within the handler. We save the original kprobes variables and just single | 512 | * within the handler. We save the original kprobes variables and just single |
505 | * step on the instruction of the new probe without calling any user handlers. | 513 | * step on the instruction of the new probe without calling any user handlers. |
506 | */ | 514 | */ |
507 | static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, | 515 | static int __kprobes |
508 | struct kprobe_ctlblk *kcb) | 516 | reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) |
509 | { | 517 | { |
510 | switch (kcb->kprobe_status) { | 518 | switch (kcb->kprobe_status) { |
511 | case KPROBE_HIT_SSDONE: | 519 | case KPROBE_HIT_SSDONE: |
@@ -600,69 +608,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) | |||
600 | return 0; | 608 | return 0; |
601 | } | 609 | } |
602 | 610 | ||
603 | #ifdef CONFIG_X86_64 | ||
604 | #define SAVE_REGS_STRING \ | ||
605 | /* Skip cs, ip, orig_ax. */ \ | ||
606 | " subq $24, %rsp\n" \ | ||
607 | " pushq %rdi\n" \ | ||
608 | " pushq %rsi\n" \ | ||
609 | " pushq %rdx\n" \ | ||
610 | " pushq %rcx\n" \ | ||
611 | " pushq %rax\n" \ | ||
612 | " pushq %r8\n" \ | ||
613 | " pushq %r9\n" \ | ||
614 | " pushq %r10\n" \ | ||
615 | " pushq %r11\n" \ | ||
616 | " pushq %rbx\n" \ | ||
617 | " pushq %rbp\n" \ | ||
618 | " pushq %r12\n" \ | ||
619 | " pushq %r13\n" \ | ||
620 | " pushq %r14\n" \ | ||
621 | " pushq %r15\n" | ||
622 | #define RESTORE_REGS_STRING \ | ||
623 | " popq %r15\n" \ | ||
624 | " popq %r14\n" \ | ||
625 | " popq %r13\n" \ | ||
626 | " popq %r12\n" \ | ||
627 | " popq %rbp\n" \ | ||
628 | " popq %rbx\n" \ | ||
629 | " popq %r11\n" \ | ||
630 | " popq %r10\n" \ | ||
631 | " popq %r9\n" \ | ||
632 | " popq %r8\n" \ | ||
633 | " popq %rax\n" \ | ||
634 | " popq %rcx\n" \ | ||
635 | " popq %rdx\n" \ | ||
636 | " popq %rsi\n" \ | ||
637 | " popq %rdi\n" \ | ||
638 | /* Skip orig_ax, ip, cs */ \ | ||
639 | " addq $24, %rsp\n" | ||
640 | #else | ||
641 | #define SAVE_REGS_STRING \ | ||
642 | /* Skip cs, ip, orig_ax and gs. */ \ | ||
643 | " subl $16, %esp\n" \ | ||
644 | " pushl %fs\n" \ | ||
645 | " pushl %es\n" \ | ||
646 | " pushl %ds\n" \ | ||
647 | " pushl %eax\n" \ | ||
648 | " pushl %ebp\n" \ | ||
649 | " pushl %edi\n" \ | ||
650 | " pushl %esi\n" \ | ||
651 | " pushl %edx\n" \ | ||
652 | " pushl %ecx\n" \ | ||
653 | " pushl %ebx\n" | ||
654 | #define RESTORE_REGS_STRING \ | ||
655 | " popl %ebx\n" \ | ||
656 | " popl %ecx\n" \ | ||
657 | " popl %edx\n" \ | ||
658 | " popl %esi\n" \ | ||
659 | " popl %edi\n" \ | ||
660 | " popl %ebp\n" \ | ||
661 | " popl %eax\n" \ | ||
662 | /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ | ||
663 | " addl $24, %esp\n" | ||
664 | #endif | ||
665 | |||
666 | /* | 611 | /* |
667 | * When a retprobed function returns, this code saves registers and | 612 | * When a retprobed function returns, this code saves registers and |
668 | * calls trampoline_handler(), which in turn calls the kretprobe's handler. | 613 | * calls trampoline_handler(), which in turn calls the kretprobe's handler. |
@@ -816,8 +761,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) | |||
816 | * jump instruction after the copied instruction, that jumps to the next | 761 | * jump instruction after the copied instruction, that jumps to the next |
817 | * instruction after the probepoint. | 762 | * instruction after the probepoint. |
818 | */ | 763 | */ |
819 | static void __kprobes resume_execution(struct kprobe *p, | 764 | static void __kprobes |
820 | struct pt_regs *regs, struct kprobe_ctlblk *kcb) | 765 | resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) |
821 | { | 766 | { |
822 | unsigned long *tos = stack_addr(regs); | 767 | unsigned long *tos = stack_addr(regs); |
823 | unsigned long copy_ip = (unsigned long)p->ainsn.insn; | 768 | unsigned long copy_ip = (unsigned long)p->ainsn.insn; |
@@ -996,8 +941,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) | |||
996 | /* | 941 | /* |
997 | * Wrapper routine for handling exceptions. | 942 | * Wrapper routine for handling exceptions. |
998 | */ | 943 | */ |
999 | int __kprobes kprobe_exceptions_notify(struct notifier_block *self, | 944 | int __kprobes |
1000 | unsigned long val, void *data) | 945 | kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) |
1001 | { | 946 | { |
1002 | struct die_args *args = data; | 947 | struct die_args *args = data; |
1003 | int ret = NOTIFY_DONE; | 948 | int ret = NOTIFY_DONE; |
@@ -1107,466 +1052,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) | |||
1107 | return 0; | 1052 | return 0; |
1108 | } | 1053 | } |
1109 | 1054 | ||
1110 | |||
1111 | #ifdef CONFIG_OPTPROBES | ||
1112 | |||
1113 | /* Insert a call instruction at address 'from', which calls address 'to'.*/ | ||
1114 | static void __kprobes synthesize_relcall(void *from, void *to) | ||
1115 | { | ||
1116 | __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); | ||
1117 | } | ||
1118 | |||
1119 | /* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ | ||
1120 | static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, | ||
1121 | unsigned long val) | ||
1122 | { | ||
1123 | #ifdef CONFIG_X86_64 | ||
1124 | *addr++ = 0x48; | ||
1125 | *addr++ = 0xbf; | ||
1126 | #else | ||
1127 | *addr++ = 0xb8; | ||
1128 | #endif | ||
1129 | *(unsigned long *)addr = val; | ||
1130 | } | ||
1131 | |||
1132 | static void __used __kprobes kprobes_optinsn_template_holder(void) | ||
1133 | { | ||
1134 | asm volatile ( | ||
1135 | ".global optprobe_template_entry\n" | ||
1136 | "optprobe_template_entry: \n" | ||
1137 | #ifdef CONFIG_X86_64 | ||
1138 | /* We don't bother saving the ss register */ | ||
1139 | " pushq %rsp\n" | ||
1140 | " pushfq\n" | ||
1141 | SAVE_REGS_STRING | ||
1142 | " movq %rsp, %rsi\n" | ||
1143 | ".global optprobe_template_val\n" | ||
1144 | "optprobe_template_val: \n" | ||
1145 | ASM_NOP5 | ||
1146 | ASM_NOP5 | ||
1147 | ".global optprobe_template_call\n" | ||
1148 | "optprobe_template_call: \n" | ||
1149 | ASM_NOP5 | ||
1150 | /* Move flags to rsp */ | ||
1151 | " movq 144(%rsp), %rdx\n" | ||
1152 | " movq %rdx, 152(%rsp)\n" | ||
1153 | RESTORE_REGS_STRING | ||
1154 | /* Skip flags entry */ | ||
1155 | " addq $8, %rsp\n" | ||
1156 | " popfq\n" | ||
1157 | #else /* CONFIG_X86_32 */ | ||
1158 | " pushf\n" | ||
1159 | SAVE_REGS_STRING | ||
1160 | " movl %esp, %edx\n" | ||
1161 | ".global optprobe_template_val\n" | ||
1162 | "optprobe_template_val: \n" | ||
1163 | ASM_NOP5 | ||
1164 | ".global optprobe_template_call\n" | ||
1165 | "optprobe_template_call: \n" | ||
1166 | ASM_NOP5 | ||
1167 | RESTORE_REGS_STRING | ||
1168 | " addl $4, %esp\n" /* skip cs */ | ||
1169 | " popf\n" | ||
1170 | #endif | ||
1171 | ".global optprobe_template_end\n" | ||
1172 | "optprobe_template_end: \n"); | ||
1173 | } | ||
1174 | |||
1175 | #define TMPL_MOVE_IDX \ | ||
1176 | ((long)&optprobe_template_val - (long)&optprobe_template_entry) | ||
1177 | #define TMPL_CALL_IDX \ | ||
1178 | ((long)&optprobe_template_call - (long)&optprobe_template_entry) | ||
1179 | #define TMPL_END_IDX \ | ||
1180 | ((long)&optprobe_template_end - (long)&optprobe_template_entry) | ||
1181 | |||
1182 | #define INT3_SIZE sizeof(kprobe_opcode_t) | ||
1183 | |||
1184 | /* Optimized kprobe call back function: called from optinsn */ | ||
1185 | static void __kprobes optimized_callback(struct optimized_kprobe *op, | ||
1186 | struct pt_regs *regs) | ||
1187 | { | ||
1188 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | ||
1189 | unsigned long flags; | ||
1190 | |||
1191 | /* This is possible if op is under delayed unoptimizing */ | ||
1192 | if (kprobe_disabled(&op->kp)) | ||
1193 | return; | ||
1194 | |||
1195 | local_irq_save(flags); | ||
1196 | if (kprobe_running()) { | ||
1197 | kprobes_inc_nmissed_count(&op->kp); | ||
1198 | } else { | ||
1199 | /* Save skipped registers */ | ||
1200 | #ifdef CONFIG_X86_64 | ||
1201 | regs->cs = __KERNEL_CS; | ||
1202 | #else | ||
1203 | regs->cs = __KERNEL_CS | get_kernel_rpl(); | ||
1204 | regs->gs = 0; | ||
1205 | #endif | ||
1206 | regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; | ||
1207 | regs->orig_ax = ~0UL; | ||
1208 | |||
1209 | __this_cpu_write(current_kprobe, &op->kp); | ||
1210 | kcb->kprobe_status = KPROBE_HIT_ACTIVE; | ||
1211 | opt_pre_handler(&op->kp, regs); | ||
1212 | __this_cpu_write(current_kprobe, NULL); | ||
1213 | } | ||
1214 | local_irq_restore(flags); | ||
1215 | } | ||
1216 | |||
1217 | static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) | ||
1218 | { | ||
1219 | int len = 0, ret; | ||
1220 | |||
1221 | while (len < RELATIVEJUMP_SIZE) { | ||
1222 | ret = __copy_instruction(dest + len, src + len, 1); | ||
1223 | if (!ret || !can_boost(dest + len)) | ||
1224 | return -EINVAL; | ||
1225 | len += ret; | ||
1226 | } | ||
1227 | /* Check whether the address range is reserved */ | ||
1228 | if (ftrace_text_reserved(src, src + len - 1) || | ||
1229 | alternatives_text_reserved(src, src + len - 1) || | ||
1230 | jump_label_text_reserved(src, src + len - 1)) | ||
1231 | return -EBUSY; | ||
1232 | |||
1233 | return len; | ||
1234 | } | ||
1235 | |||
1236 | /* Check whether insn is indirect jump */ | ||
1237 | static int __kprobes insn_is_indirect_jump(struct insn *insn) | ||
1238 | { | ||
1239 | return ((insn->opcode.bytes[0] == 0xff && | ||
1240 | (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ | ||
1241 | insn->opcode.bytes[0] == 0xea); /* Segment based jump */ | ||
1242 | } | ||
1243 | |||
1244 | /* Check whether insn jumps into specified address range */ | ||
1245 | static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) | ||
1246 | { | ||
1247 | unsigned long target = 0; | ||
1248 | |||
1249 | switch (insn->opcode.bytes[0]) { | ||
1250 | case 0xe0: /* loopne */ | ||
1251 | case 0xe1: /* loope */ | ||
1252 | case 0xe2: /* loop */ | ||
1253 | case 0xe3: /* jcxz */ | ||
1254 | case 0xe9: /* near relative jump */ | ||
1255 | case 0xeb: /* short relative jump */ | ||
1256 | break; | ||
1257 | case 0x0f: | ||
1258 | if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ | ||
1259 | break; | ||
1260 | return 0; | ||
1261 | default: | ||
1262 | if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ | ||
1263 | break; | ||
1264 | return 0; | ||
1265 | } | ||
1266 | target = (unsigned long)insn->next_byte + insn->immediate.value; | ||
1267 | |||
1268 | return (start <= target && target <= start + len); | ||
1269 | } | ||
1270 | |||
1271 | /* Decode whole function to ensure any instructions don't jump into target */ | ||
1272 | static int __kprobes can_optimize(unsigned long paddr) | ||
1273 | { | ||
1274 | int ret; | ||
1275 | unsigned long addr, size = 0, offset = 0; | ||
1276 | struct insn insn; | ||
1277 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | ||
1278 | |||
1279 | /* Lookup symbol including addr */ | ||
1280 | if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) | ||
1281 | return 0; | ||
1282 | |||
1283 | /* | ||
1284 | * Do not optimize in the entry code due to the unstable | ||
1285 | * stack handling. | ||
1286 | */ | ||
1287 | if ((paddr >= (unsigned long )__entry_text_start) && | ||
1288 | (paddr < (unsigned long )__entry_text_end)) | ||
1289 | return 0; | ||
1290 | |||
1291 | /* Check there is enough space for a relative jump. */ | ||
1292 | if (size - offset < RELATIVEJUMP_SIZE) | ||
1293 | return 0; | ||
1294 | |||
1295 | /* Decode instructions */ | ||
1296 | addr = paddr - offset; | ||
1297 | while (addr < paddr - offset + size) { /* Decode until function end */ | ||
1298 | if (search_exception_tables(addr)) | ||
1299 | /* | ||
1300 | * Since some fixup code will jump into this function, | ||
1301 | * we can't optimize kprobes in this function. | ||
1302 | */ | ||
1303 | return 0; | ||
1304 | kernel_insn_init(&insn, (void *)addr); | ||
1305 | insn_get_opcode(&insn); | ||
1306 | if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { | ||
1307 | ret = recover_probed_instruction(buf, addr); | ||
1308 | if (ret) | ||
1309 | return 0; | ||
1310 | kernel_insn_init(&insn, buf); | ||
1311 | } | ||
1312 | insn_get_length(&insn); | ||
1313 | /* Recover address */ | ||
1314 | insn.kaddr = (void *)addr; | ||
1315 | insn.next_byte = (void *)(addr + insn.length); | ||
1316 | /* Check any instructions don't jump into target */ | ||
1317 | if (insn_is_indirect_jump(&insn) || | ||
1318 | insn_jump_into_range(&insn, paddr + INT3_SIZE, | ||
1319 | RELATIVE_ADDR_SIZE)) | ||
1320 | return 0; | ||
1321 | addr += insn.length; | ||
1322 | } | ||
1323 | |||
1324 | return 1; | ||
1325 | } | ||
1326 | |||
1327 | /* Check optimized_kprobe can actually be optimized. */ | ||
1328 | int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) | ||
1329 | { | ||
1330 | int i; | ||
1331 | struct kprobe *p; | ||
1332 | |||
1333 | for (i = 1; i < op->optinsn.size; i++) { | ||
1334 | p = get_kprobe(op->kp.addr + i); | ||
1335 | if (p && !kprobe_disabled(p)) | ||
1336 | return -EEXIST; | ||
1337 | } | ||
1338 | |||
1339 | return 0; | ||
1340 | } | ||
1341 | |||
1342 | /* Check the addr is within the optimized instructions. */ | ||
1343 | int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, | ||
1344 | unsigned long addr) | ||
1345 | { | ||
1346 | return ((unsigned long)op->kp.addr <= addr && | ||
1347 | (unsigned long)op->kp.addr + op->optinsn.size > addr); | ||
1348 | } | ||
1349 | |||
1350 | /* Free optimized instruction slot */ | ||
1351 | static __kprobes | ||
1352 | void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) | ||
1353 | { | ||
1354 | if (op->optinsn.insn) { | ||
1355 | free_optinsn_slot(op->optinsn.insn, dirty); | ||
1356 | op->optinsn.insn = NULL; | ||
1357 | op->optinsn.size = 0; | ||
1358 | } | ||
1359 | } | ||
1360 | |||
1361 | void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) | ||
1362 | { | ||
1363 | __arch_remove_optimized_kprobe(op, 1); | ||
1364 | } | ||
1365 | |||
1366 | /* | ||
1367 | * Copy replacing target instructions | ||
1368 | * Target instructions MUST be relocatable (checked inside) | ||
1369 | */ | ||
1370 | int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) | ||
1371 | { | ||
1372 | u8 *buf; | ||
1373 | int ret; | ||
1374 | long rel; | ||
1375 | |||
1376 | if (!can_optimize((unsigned long)op->kp.addr)) | ||
1377 | return -EILSEQ; | ||
1378 | |||
1379 | op->optinsn.insn = get_optinsn_slot(); | ||
1380 | if (!op->optinsn.insn) | ||
1381 | return -ENOMEM; | ||
1382 | |||
1383 | /* | ||
1384 | * Verify if the address gap is in 2GB range, because this uses | ||
1385 | * a relative jump. | ||
1386 | */ | ||
1387 | rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; | ||
1388 | if (abs(rel) > 0x7fffffff) | ||
1389 | return -ERANGE; | ||
1390 | |||
1391 | buf = (u8 *)op->optinsn.insn; | ||
1392 | |||
1393 | /* Copy instructions into the out-of-line buffer */ | ||
1394 | ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); | ||
1395 | if (ret < 0) { | ||
1396 | __arch_remove_optimized_kprobe(op, 0); | ||
1397 | return ret; | ||
1398 | } | ||
1399 | op->optinsn.size = ret; | ||
1400 | |||
1401 | /* Copy arch-dep-instance from template */ | ||
1402 | memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); | ||
1403 | |||
1404 | /* Set probe information */ | ||
1405 | synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); | ||
1406 | |||
1407 | /* Set probe function call */ | ||
1408 | synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); | ||
1409 | |||
1410 | /* Set returning jmp instruction at the tail of out-of-line buffer */ | ||
1411 | synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, | ||
1412 | (u8 *)op->kp.addr + op->optinsn.size); | ||
1413 | |||
1414 | flush_icache_range((unsigned long) buf, | ||
1415 | (unsigned long) buf + TMPL_END_IDX + | ||
1416 | op->optinsn.size + RELATIVEJUMP_SIZE); | ||
1417 | return 0; | ||
1418 | } | ||
1419 | |||
1420 | #define MAX_OPTIMIZE_PROBES 256 | ||
1421 | static struct text_poke_param *jump_poke_params; | ||
1422 | static struct jump_poke_buffer { | ||
1423 | u8 buf[RELATIVEJUMP_SIZE]; | ||
1424 | } *jump_poke_bufs; | ||
1425 | |||
1426 | static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, | ||
1427 | u8 *insn_buf, | ||
1428 | struct optimized_kprobe *op) | ||
1429 | { | ||
1430 | s32 rel = (s32)((long)op->optinsn.insn - | ||
1431 | ((long)op->kp.addr + RELATIVEJUMP_SIZE)); | ||
1432 | |||
1433 | /* Backup instructions which will be replaced by jump address */ | ||
1434 | memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, | ||
1435 | RELATIVE_ADDR_SIZE); | ||
1436 | |||
1437 | insn_buf[0] = RELATIVEJUMP_OPCODE; | ||
1438 | *(s32 *)(&insn_buf[1]) = rel; | ||
1439 | |||
1440 | tprm->addr = op->kp.addr; | ||
1441 | tprm->opcode = insn_buf; | ||
1442 | tprm->len = RELATIVEJUMP_SIZE; | ||
1443 | } | ||
1444 | |||
1445 | /* | ||
1446 | * Replace breakpoints (int3) with relative jumps. | ||
1447 | * Caller must hold kprobe_mutex and text_mutex. | ||
1448 | */ | ||
1449 | void __kprobes arch_optimize_kprobes(struct list_head *oplist) | ||
1450 | { | ||
1451 | struct optimized_kprobe *op, *tmp; | ||
1452 | int c = 0; | ||
1453 | |||
1454 | list_for_each_entry_safe(op, tmp, oplist, list) { | ||
1455 | WARN_ON(kprobe_disabled(&op->kp)); | ||
1456 | /* Setup param */ | ||
1457 | setup_optimize_kprobe(&jump_poke_params[c], | ||
1458 | jump_poke_bufs[c].buf, op); | ||
1459 | list_del_init(&op->list); | ||
1460 | if (++c >= MAX_OPTIMIZE_PROBES) | ||
1461 | break; | ||
1462 | } | ||
1463 | |||
1464 | /* | ||
1465 | * text_poke_smp doesn't support NMI/MCE code modifying. | ||
1466 | * However, since kprobes itself also doesn't support NMI/MCE | ||
1467 | * code probing, it's not a problem. | ||
1468 | */ | ||
1469 | text_poke_smp_batch(jump_poke_params, c); | ||
1470 | } | ||
1471 | |||
1472 | static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, | ||
1473 | u8 *insn_buf, | ||
1474 | struct optimized_kprobe *op) | ||
1475 | { | ||
1476 | /* Set int3 to first byte for kprobes */ | ||
1477 | insn_buf[0] = BREAKPOINT_INSTRUCTION; | ||
1478 | memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); | ||
1479 | |||
1480 | tprm->addr = op->kp.addr; | ||
1481 | tprm->opcode = insn_buf; | ||
1482 | tprm->len = RELATIVEJUMP_SIZE; | ||
1483 | } | ||
1484 | |||
1485 | /* | ||
1486 | * Recover original instructions and breakpoints from relative jumps. | ||
1487 | * Caller must call with locking kprobe_mutex. | ||
1488 | */ | ||
1489 | extern void arch_unoptimize_kprobes(struct list_head *oplist, | ||
1490 | struct list_head *done_list) | ||
1491 | { | ||
1492 | struct optimized_kprobe *op, *tmp; | ||
1493 | int c = 0; | ||
1494 | |||
1495 | list_for_each_entry_safe(op, tmp, oplist, list) { | ||
1496 | /* Setup param */ | ||
1497 | setup_unoptimize_kprobe(&jump_poke_params[c], | ||
1498 | jump_poke_bufs[c].buf, op); | ||
1499 | list_move(&op->list, done_list); | ||
1500 | if (++c >= MAX_OPTIMIZE_PROBES) | ||
1501 | break; | ||
1502 | } | ||
1503 | |||
1504 | /* | ||
1505 | * text_poke_smp doesn't support NMI/MCE code modifying. | ||
1506 | * However, since kprobes itself also doesn't support NMI/MCE | ||
1507 | * code probing, it's not a problem. | ||
1508 | */ | ||
1509 | text_poke_smp_batch(jump_poke_params, c); | ||
1510 | } | ||
1511 | |||
1512 | /* Replace a relative jump with a breakpoint (int3). */ | ||
1513 | void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) | ||
1514 | { | ||
1515 | u8 buf[RELATIVEJUMP_SIZE]; | ||
1516 | |||
1517 | /* Set int3 to first byte for kprobes */ | ||
1518 | buf[0] = BREAKPOINT_INSTRUCTION; | ||
1519 | memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); | ||
1520 | text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); | ||
1521 | } | ||
1522 | |||
1523 | static int __kprobes setup_detour_execution(struct kprobe *p, | ||
1524 | struct pt_regs *regs, | ||
1525 | int reenter) | ||
1526 | { | ||
1527 | struct optimized_kprobe *op; | ||
1528 | |||
1529 | if (p->flags & KPROBE_FLAG_OPTIMIZED) { | ||
1530 | /* This kprobe is really able to run optimized path. */ | ||
1531 | op = container_of(p, struct optimized_kprobe, kp); | ||
1532 | /* Detour through copied instructions */ | ||
1533 | regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; | ||
1534 | if (!reenter) | ||
1535 | reset_current_kprobe(); | ||
1536 | preempt_enable_no_resched(); | ||
1537 | return 1; | ||
1538 | } | ||
1539 | return 0; | ||
1540 | } | ||
1541 | |||
1542 | static int __kprobes init_poke_params(void) | ||
1543 | { | ||
1544 | /* Allocate code buffer and parameter array */ | ||
1545 | jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * | ||
1546 | MAX_OPTIMIZE_PROBES, GFP_KERNEL); | ||
1547 | if (!jump_poke_bufs) | ||
1548 | return -ENOMEM; | ||
1549 | |||
1550 | jump_poke_params = kmalloc(sizeof(struct text_poke_param) * | ||
1551 | MAX_OPTIMIZE_PROBES, GFP_KERNEL); | ||
1552 | if (!jump_poke_params) { | ||
1553 | kfree(jump_poke_bufs); | ||
1554 | jump_poke_bufs = NULL; | ||
1555 | return -ENOMEM; | ||
1556 | } | ||
1557 | |||
1558 | return 0; | ||
1559 | } | ||
1560 | #else /* !CONFIG_OPTPROBES */ | ||
1561 | static int __kprobes init_poke_params(void) | ||
1562 | { | ||
1563 | return 0; | ||
1564 | } | ||
1565 | #endif | ||
1566 | |||
1567 | int __init arch_init_kprobes(void) | 1055 | int __init arch_init_kprobes(void) |
1568 | { | 1056 | { |
1569 | return init_poke_params(); | 1057 | return arch_init_optprobes(); |
1570 | } | 1058 | } |
1571 | 1059 | ||
1572 | int __kprobes arch_trampoline_kprobe(struct kprobe *p) | 1060 | int __kprobes arch_trampoline_kprobe(struct kprobe *p) |
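The synthesize_reljump()/synthesize_relcall() helpers made non-static above both reduce to __synthesize_relative_insn(), which writes a one-byte opcode (0xe9 for jmp, 0xe8 for call) followed by a signed 32-bit displacement measured from the end of the 5-byte instruction. The sketch below is a minimal user-space model of that encoding, assuming a little-endian target; the buffer and the main() self-check are illustrative and not part of the patch.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define RELATIVEJUMP_OPCODE  0xe9	/* jmp rel32 */
	#define RELATIVECALL_OPCODE  0xe8	/* call rel32 */
	#define RELATIVE_INSN_SIZE   5

	/* Model of __synthesize_relative_insn(): the displacement is relative
	 * to the byte following the 5-byte instruction. */
	static void synthesize_relative_insn(uint8_t *from, const uint8_t *to, uint8_t op)
	{
		int32_t disp = (int32_t)((intptr_t)to - ((intptr_t)from + RELATIVE_INSN_SIZE));

		from[0] = op;
		memcpy(&from[1], &disp, sizeof(disp));	/* little-endian rel32 */
	}

	int main(void)
	{
		uint8_t buf[64] = { 0 };
		int32_t disp;

		/* Jump from buf[0] to buf[32]: expected displacement is 32 - 5 = 27. */
		synthesize_relative_insn(&buf[0], &buf[32], RELATIVEJUMP_OPCODE);
		memcpy(&disp, &buf[1], sizeof(disp));
		printf("opcode=%#x disp=%d\n", buf[0], disp);
		return 0;
	}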
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f0c6fd6f176b..694d801bf606 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -438,9 +438,9 @@ void __init kvm_guest_init(void) | |||
438 | static __init int activate_jump_labels(void) | 438 | static __init int activate_jump_labels(void) |
439 | { | 439 | { |
440 | if (has_steal_clock) { | 440 | if (has_steal_clock) { |
441 | jump_label_inc(¶virt_steal_enabled); | 441 | static_key_slow_inc(¶virt_steal_enabled); |
442 | if (steal_acc) | 442 | if (steal_acc) |
443 | jump_label_inc(¶virt_steal_rq_enabled); | 443 | static_key_slow_inc(¶virt_steal_rq_enabled); |
444 | } | 444 | } |
445 | 445 | ||
446 | return 0; | 446 | return 0; |
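The jump_label_inc() calls here become static_key_slow_inc() as part of the rename of the jump-label API to static keys. Setting the in-kernel code patching aside, the observable semantics are those of an enable count: the key reads as true while the count is non-zero. The snippet below is a plain-C model of only that counting behaviour; the _model names are invented for illustration and are not the kernel API.

	#include <stdbool.h>
	#include <stdio.h>

	struct static_key_model {
		int enabled;	/* reference count of enable requests */
	};

	static void static_key_model_slow_inc(struct static_key_model *key)
	{
		key->enabled++;
	}

	static bool static_key_model_true(const struct static_key_model *key)
	{
		return key->enabled > 0;
	}

	int main(void)
	{
		struct static_key_model paravirt_steal_enabled = { 0 };

		printf("steal accounting before: %d\n",
		       static_key_model_true(&paravirt_steal_enabled));
		static_key_model_slow_inc(&paravirt_steal_enabled);	/* the has_steal_clock case */
		printf("steal accounting after:  %d\n",
		       static_key_model_true(&paravirt_steal_enabled));
		return 0;
	}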
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 44842d756b29..f8492da65bfc 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -136,6 +136,15 @@ int kvm_register_clock(char *txt) | |||
136 | return ret; | 136 | return ret; |
137 | } | 137 | } |
138 | 138 | ||
139 | static void kvm_save_sched_clock_state(void) | ||
140 | { | ||
141 | } | ||
142 | |||
143 | static void kvm_restore_sched_clock_state(void) | ||
144 | { | ||
145 | kvm_register_clock("primary cpu clock, resume"); | ||
146 | } | ||
147 | |||
139 | #ifdef CONFIG_X86_LOCAL_APIC | 148 | #ifdef CONFIG_X86_LOCAL_APIC |
140 | static void __cpuinit kvm_setup_secondary_clock(void) | 149 | static void __cpuinit kvm_setup_secondary_clock(void) |
141 | { | 150 | { |
@@ -144,8 +153,6 @@ static void __cpuinit kvm_setup_secondary_clock(void) | |||
144 | * we shouldn't fail. | 153 | * we shouldn't fail. |
145 | */ | 154 | */ |
146 | WARN_ON(kvm_register_clock("secondary cpu clock")); | 155 | WARN_ON(kvm_register_clock("secondary cpu clock")); |
147 | /* ok, done with our trickery, call native */ | ||
148 | setup_secondary_APIC_clock(); | ||
149 | } | 156 | } |
150 | #endif | 157 | #endif |
151 | 158 | ||
@@ -194,9 +201,11 @@ void __init kvmclock_init(void) | |||
194 | x86_platform.get_wallclock = kvm_get_wallclock; | 201 | x86_platform.get_wallclock = kvm_get_wallclock; |
195 | x86_platform.set_wallclock = kvm_set_wallclock; | 202 | x86_platform.set_wallclock = kvm_set_wallclock; |
196 | #ifdef CONFIG_X86_LOCAL_APIC | 203 | #ifdef CONFIG_X86_LOCAL_APIC |
197 | x86_cpuinit.setup_percpu_clockev = | 204 | x86_cpuinit.early_percpu_clock_init = |
198 | kvm_setup_secondary_clock; | 205 | kvm_setup_secondary_clock; |
199 | #endif | 206 | #endif |
207 | x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; | ||
208 | x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; | ||
200 | machine_ops.shutdown = kvm_shutdown; | 209 | machine_ops.shutdown = kvm_shutdown; |
201 | #ifdef CONFIG_KEXEC | 210 | #ifdef CONFIG_KEXEC |
202 | machine_ops.crash_shutdown = kvm_crash_shutdown; | 211 | machine_ops.crash_shutdown = kvm_crash_shutdown; |
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index fe86493f3ed1..73465aab28f8 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c | |||
@@ -311,13 +311,33 @@ out: | |||
311 | return state; | 311 | return state; |
312 | } | 312 | } |
313 | 313 | ||
314 | /* | ||
315 | * AMD microcode firmware naming convention, up to family 15h they are in | ||
316 | * the legacy file: | ||
317 | * | ||
318 | * amd-ucode/microcode_amd.bin | ||
319 | * | ||
320 | * This legacy file is always smaller than 2K in size. | ||
321 | * | ||
322 | * Starting at family 15h they are in family specific firmware files: | ||
323 | * | ||
324 | * amd-ucode/microcode_amd_fam15h.bin | ||
325 | * amd-ucode/microcode_amd_fam16h.bin | ||
326 | * ... | ||
327 | * | ||
328 | * These might be larger than 2K. | ||
329 | */ | ||
314 | static enum ucode_state request_microcode_amd(int cpu, struct device *device) | 330 | static enum ucode_state request_microcode_amd(int cpu, struct device *device) |
315 | { | 331 | { |
316 | const char *fw_name = "amd-ucode/microcode_amd.bin"; | 332 | char fw_name[36] = "amd-ucode/microcode_amd.bin"; |
317 | const struct firmware *fw; | 333 | const struct firmware *fw; |
318 | enum ucode_state ret = UCODE_NFOUND; | 334 | enum ucode_state ret = UCODE_NFOUND; |
335 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
336 | |||
337 | if (c->x86 >= 0x15) | ||
338 | snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); | ||
319 | 339 | ||
320 | if (request_firmware(&fw, fw_name, device)) { | 340 | if (request_firmware(&fw, (const char *)fw_name, device)) { |
321 | pr_err("failed to load file %s\n", fw_name); | 341 | pr_err("failed to load file %s\n", fw_name); |
322 | goto out; | 342 | goto out; |
323 | } | 343 | } |
@@ -340,7 +360,6 @@ out: | |||
340 | static enum ucode_state | 360 | static enum ucode_state |
341 | request_microcode_user(int cpu, const void __user *buf, size_t size) | 361 | request_microcode_user(int cpu, const void __user *buf, size_t size) |
342 | { | 362 | { |
343 | pr_info("AMD microcode update via /dev/cpu/microcode not supported\n"); | ||
344 | return UCODE_ERROR; | 363 | return UCODE_ERROR; |
345 | } | 364 | } |
346 | 365 | ||
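The comment block added above documents the naming scheme that the new snprintf() implements: families below 0x15 keep the legacy amd-ucode/microcode_amd.bin container, while family 15h and newer get a per-family file. A small user-space sketch of just that name selection follows; amd_ucode_fw_name() and the sample family list are invented for illustration, and only the format string mirrors the patch.

	#include <stdio.h>

	static void amd_ucode_fw_name(unsigned int family, char *buf, size_t len)
	{
		if (family >= 0x15)
			snprintf(buf, len, "amd-ucode/microcode_amd_fam%.2xh.bin", family);
		else
			snprintf(buf, len, "amd-ucode/microcode_amd.bin");
	}

	int main(void)
	{
		unsigned int families[] = { 0x10, 0x14, 0x15, 0x16 };
		char name[36];	/* same size as fw_name[] in the patch */
		unsigned int i;

		for (i = 0; i < sizeof(families) / sizeof(families[0]); i++) {
			amd_ucode_fw_name(families[i], name, sizeof(name));
			printf("family 0x%02x -> %s\n", families[i], name);
		}
		return 0;
	}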
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fda91c307104..87a0f8688301 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c | |||
@@ -86,6 +86,7 @@ | |||
86 | 86 | ||
87 | #include <asm/microcode.h> | 87 | #include <asm/microcode.h> |
88 | #include <asm/processor.h> | 88 | #include <asm/processor.h> |
89 | #include <asm/cpu_device_id.h> | ||
89 | 90 | ||
90 | MODULE_DESCRIPTION("Microcode Update Driver"); | 91 | MODULE_DESCRIPTION("Microcode Update Driver"); |
91 | MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); | 92 | MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); |
@@ -504,6 +505,20 @@ static struct notifier_block __refdata mc_cpu_notifier = { | |||
504 | .notifier_call = mc_cpu_callback, | 505 | .notifier_call = mc_cpu_callback, |
505 | }; | 506 | }; |
506 | 507 | ||
508 | #ifdef MODULE | ||
509 | /* Autoload on Intel and AMD systems */ | ||
510 | static const struct x86_cpu_id microcode_id[] = { | ||
511 | #ifdef CONFIG_MICROCODE_INTEL | ||
512 | { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, | ||
513 | #endif | ||
514 | #ifdef CONFIG_MICROCODE_AMD | ||
515 | { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, | ||
516 | #endif | ||
517 | {} | ||
518 | }; | ||
519 | MODULE_DEVICE_TABLE(x86cpu, microcode_id); | ||
520 | #endif | ||
521 | |||
507 | static int __init microcode_init(void) | 522 | static int __init microcode_init(void) |
508 | { | 523 | { |
509 | struct cpuinfo_x86 *c = &cpu_data(0); | 524 | struct cpuinfo_x86 *c = &cpu_data(0); |
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 0d01a8ea4e11..2c39dcd510fa 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/smp.h> | 12 | #include <linux/smp.h> |
13 | #include <linux/cpumask.h> | 13 | #include <linux/cpumask.h> |
14 | #include <linux/delay.h> | 14 | #include <linux/delay.h> |
15 | #include <linux/init.h> | ||
15 | 16 | ||
16 | #include <asm/apic.h> | 17 | #include <asm/apic.h> |
17 | #include <asm/nmi.h> | 18 | #include <asm/nmi.h> |
@@ -20,35 +21,35 @@ | |||
20 | #define FAILURE 1 | 21 | #define FAILURE 1 |
21 | #define TIMEOUT 2 | 22 | #define TIMEOUT 2 |
22 | 23 | ||
23 | static int nmi_fail; | 24 | static int __initdata nmi_fail; |
24 | 25 | ||
25 | /* check to see if NMI IPIs work on this machine */ | 26 | /* check to see if NMI IPIs work on this machine */ |
26 | static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; | 27 | static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata; |
27 | 28 | ||
28 | static int testcase_total; | 29 | static int __initdata testcase_total; |
29 | static int testcase_successes; | 30 | static int __initdata testcase_successes; |
30 | static int expected_testcase_failures; | 31 | static int __initdata expected_testcase_failures; |
31 | static int unexpected_testcase_failures; | 32 | static int __initdata unexpected_testcase_failures; |
32 | static int unexpected_testcase_unknowns; | 33 | static int __initdata unexpected_testcase_unknowns; |
33 | 34 | ||
34 | static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) | 35 | static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) |
35 | { | 36 | { |
36 | unexpected_testcase_unknowns++; | 37 | unexpected_testcase_unknowns++; |
37 | return NMI_HANDLED; | 38 | return NMI_HANDLED; |
38 | } | 39 | } |
39 | 40 | ||
40 | static void init_nmi_testsuite(void) | 41 | static void __init init_nmi_testsuite(void) |
41 | { | 42 | { |
42 | /* trap all the unknown NMIs we may generate */ | 43 | /* trap all the unknown NMIs we may generate */ |
43 | register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); | 44 | register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); |
44 | } | 45 | } |
45 | 46 | ||
46 | static void cleanup_nmi_testsuite(void) | 47 | static void __init cleanup_nmi_testsuite(void) |
47 | { | 48 | { |
48 | unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); | 49 | unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); |
49 | } | 50 | } |
50 | 51 | ||
51 | static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) | 52 | static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) |
52 | { | 53 | { |
53 | int cpu = raw_smp_processor_id(); | 54 | int cpu = raw_smp_processor_id(); |
54 | 55 | ||
@@ -58,7 +59,7 @@ static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) | |||
58 | return NMI_DONE; | 59 | return NMI_DONE; |
59 | } | 60 | } |
60 | 61 | ||
61 | static void test_nmi_ipi(struct cpumask *mask) | 62 | static void __init test_nmi_ipi(struct cpumask *mask) |
62 | { | 63 | { |
63 | unsigned long timeout; | 64 | unsigned long timeout; |
64 | 65 | ||
@@ -86,7 +87,7 @@ static void test_nmi_ipi(struct cpumask *mask) | |||
86 | return; | 87 | return; |
87 | } | 88 | } |
88 | 89 | ||
89 | static void remote_ipi(void) | 90 | static void __init remote_ipi(void) |
90 | { | 91 | { |
91 | cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); | 92 | cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); |
92 | cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); | 93 | cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); |
@@ -94,19 +95,19 @@ static void remote_ipi(void) | |||
94 | test_nmi_ipi(to_cpumask(nmi_ipi_mask)); | 95 | test_nmi_ipi(to_cpumask(nmi_ipi_mask)); |
95 | } | 96 | } |
96 | 97 | ||
97 | static void local_ipi(void) | 98 | static void __init local_ipi(void) |
98 | { | 99 | { |
99 | cpumask_clear(to_cpumask(nmi_ipi_mask)); | 100 | cpumask_clear(to_cpumask(nmi_ipi_mask)); |
100 | cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); | 101 | cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); |
101 | test_nmi_ipi(to_cpumask(nmi_ipi_mask)); | 102 | test_nmi_ipi(to_cpumask(nmi_ipi_mask)); |
102 | } | 103 | } |
103 | 104 | ||
104 | static void reset_nmi(void) | 105 | static void __init reset_nmi(void) |
105 | { | 106 | { |
106 | nmi_fail = 0; | 107 | nmi_fail = 0; |
107 | } | 108 | } |
108 | 109 | ||
109 | static void dotest(void (*testcase_fn)(void), int expected) | 110 | static void __init dotest(void (*testcase_fn)(void), int expected) |
110 | { | 111 | { |
111 | testcase_fn(); | 112 | testcase_fn(); |
112 | /* | 113 | /* |
@@ -131,12 +132,12 @@ static void dotest(void (*testcase_fn)(void), int expected) | |||
131 | reset_nmi(); | 132 | reset_nmi(); |
132 | } | 133 | } |
133 | 134 | ||
134 | static inline void print_testname(const char *testname) | 135 | static inline void __init print_testname(const char *testname) |
135 | { | 136 | { |
136 | printk("%12s:", testname); | 137 | printk("%12s:", testname); |
137 | } | 138 | } |
138 | 139 | ||
139 | void nmi_selftest(void) | 140 | void __init nmi_selftest(void) |
140 | { | 141 | { |
141 | init_nmi_testsuite(); | 142 | init_nmi_testsuite(); |
142 | 143 | ||
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d90272e6bc40..9c57c02e54f6 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -26,6 +26,7 @@ | |||
26 | 26 | ||
27 | #include <asm/bug.h> | 27 | #include <asm/bug.h> |
28 | #include <asm/paravirt.h> | 28 | #include <asm/paravirt.h> |
29 | #include <asm/debugreg.h> | ||
29 | #include <asm/desc.h> | 30 | #include <asm/desc.h> |
30 | #include <asm/setup.h> | 31 | #include <asm/setup.h> |
31 | #include <asm/pgtable.h> | 32 | #include <asm/pgtable.h> |
@@ -202,8 +203,8 @@ static void native_flush_tlb_single(unsigned long addr) | |||
202 | __native_flush_tlb_single(addr); | 203 | __native_flush_tlb_single(addr); |
203 | } | 204 | } |
204 | 205 | ||
205 | struct jump_label_key paravirt_steal_enabled; | 206 | struct static_key paravirt_steal_enabled; |
206 | struct jump_label_key paravirt_steal_rq_enabled; | 207 | struct static_key paravirt_steal_rq_enabled; |
207 | 208 | ||
208 | static u64 native_steal_clock(int cpu) | 209 | static u64 native_steal_clock(int cpu) |
209 | { | 210 | { |
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1c4d769e21ea..28e5e06fcba4 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -262,10 +262,11 @@ rootfs_initcall(pci_iommu_init); | |||
262 | 262 | ||
263 | static __devinit void via_no_dac(struct pci_dev *dev) | 263 | static __devinit void via_no_dac(struct pci_dev *dev) |
264 | { | 264 | { |
265 | if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { | 265 | if (forbid_dac == 0) { |
266 | dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); | 266 | dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); |
267 | forbid_dac = 1; | 267 | forbid_dac = 1; |
268 | } | 268 | } |
269 | } | 269 | } |
270 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); | 270 | DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, |
271 | PCI_CLASS_BRIDGE_PCI, 8, via_no_dac); | ||
271 | #endif | 272 | #endif |
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 34e06e84ce31..0bc72e2069e3 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/pci.h> | 12 | #include <linux/pci.h> |
13 | #include <linux/export.h> | 13 | #include <linux/export.h> |
14 | 14 | ||
15 | #include <asm/probe_roms.h> | ||
15 | #include <asm/pci-direct.h> | 16 | #include <asm/pci-direct.h> |
16 | #include <asm/e820.h> | 17 | #include <asm/e820.h> |
17 | #include <asm/mmzone.h> | 18 | #include <asm/mmzone.h> |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 15763af7bfe3..14baf78d5a1f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <asm/idle.h> | 21 | #include <asm/idle.h> |
22 | #include <asm/uaccess.h> | 22 | #include <asm/uaccess.h> |
23 | #include <asm/i387.h> | 23 | #include <asm/i387.h> |
24 | #include <asm/fpu-internal.h> | ||
24 | #include <asm/debugreg.h> | 25 | #include <asm/debugreg.h> |
25 | 26 | ||
26 | struct kmem_cache *task_xstate_cachep; | 27 | struct kmem_cache *task_xstate_cachep; |
@@ -377,8 +378,8 @@ static inline int hlt_use_halt(void) | |||
377 | void default_idle(void) | 378 | void default_idle(void) |
378 | { | 379 | { |
379 | if (hlt_use_halt()) { | 380 | if (hlt_use_halt()) { |
380 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); | 381 | trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); |
381 | trace_cpu_idle(1, smp_processor_id()); | 382 | trace_cpu_idle_rcuidle(1, smp_processor_id()); |
382 | current_thread_info()->status &= ~TS_POLLING; | 383 | current_thread_info()->status &= ~TS_POLLING; |
383 | /* | 384 | /* |
384 | * TS_POLLING-cleared state must be visible before we | 385 | * TS_POLLING-cleared state must be visible before we |
@@ -391,8 +392,8 @@ void default_idle(void) | |||
391 | else | 392 | else |
392 | local_irq_enable(); | 393 | local_irq_enable(); |
393 | current_thread_info()->status |= TS_POLLING; | 394 | current_thread_info()->status |= TS_POLLING; |
394 | trace_power_end(smp_processor_id()); | 395 | trace_power_end_rcuidle(smp_processor_id()); |
395 | trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); | 396 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
396 | } else { | 397 | } else { |
397 | local_irq_enable(); | 398 | local_irq_enable(); |
398 | /* loop is done by the caller */ | 399 | /* loop is done by the caller */ |
@@ -450,8 +451,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); | |||
450 | static void mwait_idle(void) | 451 | static void mwait_idle(void) |
451 | { | 452 | { |
452 | if (!need_resched()) { | 453 | if (!need_resched()) { |
453 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); | 454 | trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); |
454 | trace_cpu_idle(1, smp_processor_id()); | 455 | trace_cpu_idle_rcuidle(1, smp_processor_id()); |
455 | if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) | 456 | if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) |
456 | clflush((void *)¤t_thread_info()->flags); | 457 | clflush((void *)¤t_thread_info()->flags); |
457 | 458 | ||
@@ -461,8 +462,8 @@ static void mwait_idle(void) | |||
461 | __sti_mwait(0, 0); | 462 | __sti_mwait(0, 0); |
462 | else | 463 | else |
463 | local_irq_enable(); | 464 | local_irq_enable(); |
464 | trace_power_end(smp_processor_id()); | 465 | trace_power_end_rcuidle(smp_processor_id()); |
465 | trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); | 466 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
466 | } else | 467 | } else |
467 | local_irq_enable(); | 468 | local_irq_enable(); |
468 | } | 469 | } |
@@ -474,13 +475,13 @@ static void mwait_idle(void) | |||
474 | */ | 475 | */ |
475 | static void poll_idle(void) | 476 | static void poll_idle(void) |
476 | { | 477 | { |
477 | trace_power_start(POWER_CSTATE, 0, smp_processor_id()); | 478 | trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id()); |
478 | trace_cpu_idle(0, smp_processor_id()); | 479 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
479 | local_irq_enable(); | 480 | local_irq_enable(); |
480 | while (!need_resched()) | 481 | while (!need_resched()) |
481 | cpu_relax(); | 482 | cpu_relax(); |
482 | trace_power_end(smp_processor_id()); | 483 | trace_power_end_rcuidle(smp_processor_id()); |
483 | trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); | 484 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
484 | } | 485 | } |
485 | 486 | ||
486 | /* | 487 | /* |
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 485204f58cda..9d7d4842bfaf 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <asm/ldt.h> | 45 | #include <asm/ldt.h> |
46 | #include <asm/processor.h> | 46 | #include <asm/processor.h> |
47 | #include <asm/i387.h> | 47 | #include <asm/i387.h> |
48 | #include <asm/fpu-internal.h> | ||
48 | #include <asm/desc.h> | 49 | #include <asm/desc.h> |
49 | #ifdef CONFIG_MATH_EMULATION | 50 | #ifdef CONFIG_MATH_EMULATION |
50 | #include <asm/math_emu.h> | 51 | #include <asm/math_emu.h> |
@@ -119,9 +120,7 @@ void cpu_idle(void) | |||
119 | } | 120 | } |
120 | rcu_idle_exit(); | 121 | rcu_idle_exit(); |
121 | tick_nohz_idle_exit(); | 122 | tick_nohz_idle_exit(); |
122 | preempt_enable_no_resched(); | 123 | schedule_preempt_disabled(); |
123 | schedule(); | ||
124 | preempt_disable(); | ||
125 | } | 124 | } |
126 | } | 125 | } |
127 | 126 | ||
@@ -214,6 +213,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
214 | 213 | ||
215 | task_user_gs(p) = get_user_gs(regs); | 214 | task_user_gs(p) = get_user_gs(regs); |
216 | 215 | ||
216 | p->fpu_counter = 0; | ||
217 | p->thread.io_bitmap_ptr = NULL; | 217 | p->thread.io_bitmap_ptr = NULL; |
218 | tsk = current; | 218 | tsk = current; |
219 | err = -ENOMEM; | 219 | err = -ENOMEM; |
@@ -299,22 +299,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
299 | *next = &next_p->thread; | 299 | *next = &next_p->thread; |
300 | int cpu = smp_processor_id(); | 300 | int cpu = smp_processor_id(); |
301 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 301 | struct tss_struct *tss = &per_cpu(init_tss, cpu); |
302 | bool preload_fpu; | 302 | fpu_switch_t fpu; |
303 | 303 | ||
304 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ | 304 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ |
305 | 305 | ||
306 | /* | 306 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
307 | * If the task has used fpu the last 5 timeslices, just do a full | ||
308 | * restore of the math state immediately to avoid the trap; the | ||
309 | * chances of needing FPU soon are obviously high now | ||
310 | */ | ||
311 | preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; | ||
312 | |||
313 | __unlazy_fpu(prev_p); | ||
314 | |||
315 | /* we're going to use this soon, after a few expensive things */ | ||
316 | if (preload_fpu) | ||
317 | prefetch(next->fpu.state); | ||
318 | 307 | ||
319 | /* | 308 | /* |
320 | * Reload esp0. | 309 | * Reload esp0. |
@@ -354,11 +343,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
354 | task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) | 343 | task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) |
355 | __switch_to_xtra(prev_p, next_p, tss); | 344 | __switch_to_xtra(prev_p, next_p, tss); |
356 | 345 | ||
357 | /* If we're going to preload the fpu context, make sure clts | ||
358 | is run while we're batching the cpu state updates. */ | ||
359 | if (preload_fpu) | ||
360 | clts(); | ||
361 | |||
362 | /* | 346 | /* |
363 | * Leave lazy mode, flushing any hypercalls made here. | 347 | * Leave lazy mode, flushing any hypercalls made here. |
364 | * This must be done before restoring TLS segments so | 348 | * This must be done before restoring TLS segments so |
@@ -368,15 +352,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
368 | */ | 352 | */ |
369 | arch_end_context_switch(next_p); | 353 | arch_end_context_switch(next_p); |
370 | 354 | ||
371 | if (preload_fpu) | ||
372 | __math_state_restore(); | ||
373 | |||
374 | /* | 355 | /* |
375 | * Restore %gs if needed (which is common) | 356 | * Restore %gs if needed (which is common) |
376 | */ | 357 | */ |
377 | if (prev->gs | next->gs) | 358 | if (prev->gs | next->gs) |
378 | lazy_load_gs(next->gs); | 359 | lazy_load_gs(next->gs); |
379 | 360 | ||
361 | switch_fpu_finish(next_p, fpu); | ||
362 | |||
380 | percpu_write(current_task, next_p); | 363 | percpu_write(current_task, next_p); |
381 | 364 | ||
382 | return prev_p; | 365 | return prev_p; |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9b9fe4a85c87..292da13fc5aa 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -43,6 +43,7 @@ | |||
43 | #include <asm/system.h> | 43 | #include <asm/system.h> |
44 | #include <asm/processor.h> | 44 | #include <asm/processor.h> |
45 | #include <asm/i387.h> | 45 | #include <asm/i387.h> |
46 | #include <asm/fpu-internal.h> | ||
46 | #include <asm/mmu_context.h> | 47 | #include <asm/mmu_context.h> |
47 | #include <asm/prctl.h> | 48 | #include <asm/prctl.h> |
48 | #include <asm/desc.h> | 49 | #include <asm/desc.h> |
@@ -156,9 +157,7 @@ void cpu_idle(void) | |||
156 | } | 157 | } |
157 | 158 | ||
158 | tick_nohz_idle_exit(); | 159 | tick_nohz_idle_exit(); |
159 | preempt_enable_no_resched(); | 160 | schedule_preempt_disabled(); |
160 | schedule(); | ||
161 | preempt_disable(); | ||
162 | } | 161 | } |
163 | } | 162 | } |
164 | 163 | ||
@@ -286,6 +285,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, | |||
286 | 285 | ||
287 | set_tsk_thread_flag(p, TIF_FORK); | 286 | set_tsk_thread_flag(p, TIF_FORK); |
288 | 287 | ||
288 | p->fpu_counter = 0; | ||
289 | p->thread.io_bitmap_ptr = NULL; | 289 | p->thread.io_bitmap_ptr = NULL; |
290 | 290 | ||
291 | savesegment(gs, p->thread.gsindex); | 291 | savesegment(gs, p->thread.gsindex); |
@@ -341,6 +341,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, | |||
341 | loadsegment(es, _ds); | 341 | loadsegment(es, _ds); |
342 | loadsegment(ds, _ds); | 342 | loadsegment(ds, _ds); |
343 | load_gs_index(0); | 343 | load_gs_index(0); |
344 | current->thread.usersp = new_sp; | ||
344 | regs->ip = new_ip; | 345 | regs->ip = new_ip; |
345 | regs->sp = new_sp; | 346 | regs->sp = new_sp; |
346 | percpu_write(old_rsp, new_sp); | 347 | percpu_write(old_rsp, new_sp); |
@@ -386,18 +387,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
386 | int cpu = smp_processor_id(); | 387 | int cpu = smp_processor_id(); |
387 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 388 | struct tss_struct *tss = &per_cpu(init_tss, cpu); |
388 | unsigned fsindex, gsindex; | 389 | unsigned fsindex, gsindex; |
389 | bool preload_fpu; | 390 | fpu_switch_t fpu; |
390 | 391 | ||
391 | /* | 392 | fpu = switch_fpu_prepare(prev_p, next_p, cpu); |
392 | * If the task has used fpu the last 5 timeslices, just do a full | ||
393 | * restore of the math state immediately to avoid the trap; the | ||
394 | * chances of needing FPU soon are obviously high now | ||
395 | */ | ||
396 | preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; | ||
397 | |||
398 | /* we're going to use this soon, after a few expensive things */ | ||
399 | if (preload_fpu) | ||
400 | prefetch(next->fpu.state); | ||
401 | 393 | ||
402 | /* | 394 | /* |
403 | * Reload esp0, LDT and the page table pointer: | 395 | * Reload esp0, LDT and the page table pointer: |
@@ -427,13 +419,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
427 | 419 | ||
428 | load_TLS(next, cpu); | 420 | load_TLS(next, cpu); |
429 | 421 | ||
430 | /* Must be after DS reload */ | ||
431 | __unlazy_fpu(prev_p); | ||
432 | |||
433 | /* Make sure cpu is ready for new context */ | ||
434 | if (preload_fpu) | ||
435 | clts(); | ||
436 | |||
437 | /* | 422 | /* |
438 | * Leave lazy mode, flushing any hypercalls made here. | 423 | * Leave lazy mode, flushing any hypercalls made here. |
439 | * This must be done before restoring TLS segments so | 424 | * This must be done before restoring TLS segments so |
@@ -474,6 +459,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
474 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | 459 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); |
475 | prev->gsindex = gsindex; | 460 | prev->gsindex = gsindex; |
476 | 461 | ||
462 | switch_fpu_finish(next_p, fpu); | ||
463 | |||
477 | /* | 464 | /* |
478 | * Switch the PDA and FPU contexts. | 465 | * Switch the PDA and FPU contexts. |
479 | */ | 466 | */ |
@@ -492,13 +479,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
492 | task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) | 479 | task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) |
493 | __switch_to_xtra(prev_p, next_p, tss); | 480 | __switch_to_xtra(prev_p, next_p, tss); |
494 | 481 | ||
495 | /* | ||
496 | * Preload the FPU context, now that we've determined that the | ||
497 | * task is likely to be using it. | ||
498 | */ | ||
499 | if (preload_fpu) | ||
500 | __math_state_restore(); | ||
501 | |||
502 | return prev_p; | 482 | return prev_p; |
503 | } | 483 | } |
504 | 484 | ||
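With those helpers, the 32-bit and 64-bit __switch_to() share the same shape; a condensed outline (not verbatim kernel code, details elided):

struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	int cpu = smp_processor_id();
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);	/* before the segment reloads */

	/* ... reload esp0, TLS, segments, arch_end_context_switch(next_p) ... */

	switch_fpu_finish(next_p, fpu);			/* after the segments are sane */

	percpu_write(current_task, next_p);
	return prev_p;
}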
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 50267386b766..78f05e438be5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <asm/system.h> | 27 | #include <asm/system.h> |
28 | #include <asm/processor.h> | 28 | #include <asm/processor.h> |
29 | #include <asm/i387.h> | 29 | #include <asm/i387.h> |
30 | #include <asm/fpu-internal.h> | ||
30 | #include <asm/debugreg.h> | 31 | #include <asm/debugreg.h> |
31 | #include <asm/ldt.h> | 32 | #include <asm/ldt.h> |
32 | #include <asm/desc.h> | 33 | #include <asm/desc.h> |
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 37a458b521a6..d840e69a853c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -39,6 +39,14 @@ static int reboot_mode; | |||
39 | enum reboot_type reboot_type = BOOT_ACPI; | 39 | enum reboot_type reboot_type = BOOT_ACPI; |
40 | int reboot_force; | 40 | int reboot_force; |
41 | 41 | ||
42 | /* This variable is used privately to keep track of whether or not | ||
43 | * reboot_type is still set to its default value (i.e., reboot= hasn't | ||
44 | * been set on the command line). This is needed so that we can | ||
45 | * suppress DMI scanning for reboot quirks. Without it, it's | ||
46 | * impossible to override a faulty reboot quirk without recompiling. | ||
47 | */ | ||
48 | static int reboot_default = 1; | ||
49 | |||
42 | #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) | 50 | #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) |
43 | static int reboot_cpu = -1; | 51 | static int reboot_cpu = -1; |
44 | #endif | 52 | #endif |
@@ -67,6 +75,12 @@ bool port_cf9_safe = false; | |||
67 | static int __init reboot_setup(char *str) | 75 | static int __init reboot_setup(char *str) |
68 | { | 76 | { |
69 | for (;;) { | 77 | for (;;) { |
78 | /* Having anything passed on the command line via | ||
79 | * reboot= will cause us to disable DMI checking | ||
80 | * below. | ||
81 | */ | ||
82 | reboot_default = 0; | ||
83 | |||
70 | switch (*str) { | 84 | switch (*str) { |
71 | case 'w': | 85 | case 'w': |
72 | reboot_mode = 0x1234; | 86 | reboot_mode = 0x1234; |
@@ -295,14 +309,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
295 | DMI_MATCH(DMI_BOARD_NAME, "P4S800"), | 309 | DMI_MATCH(DMI_BOARD_NAME, "P4S800"), |
296 | }, | 310 | }, |
297 | }, | 311 | }, |
298 | { /* Handle problems with rebooting on VersaLogic Menlow boards */ | ||
299 | .callback = set_bios_reboot, | ||
300 | .ident = "VersaLogic Menlow based board", | ||
301 | .matches = { | ||
302 | DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"), | ||
303 | DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"), | ||
304 | }, | ||
305 | }, | ||
306 | { /* Handle reboot issue on Acer Aspire one */ | 312 | { /* Handle reboot issue on Acer Aspire one */ |
307 | .callback = set_kbd_reboot, | 313 | .callback = set_kbd_reboot, |
308 | .ident = "Acer Aspire One A110", | 314 | .ident = "Acer Aspire One A110", |
@@ -316,7 +322,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { | |||
316 | 322 | ||
317 | static int __init reboot_init(void) | 323 | static int __init reboot_init(void) |
318 | { | 324 | { |
319 | dmi_check_system(reboot_dmi_table); | 325 | /* Only do the DMI check if reboot_type hasn't been overridden |
326 | * on the command line | ||
327 | */ | ||
328 | if (reboot_default) { | ||
329 | dmi_check_system(reboot_dmi_table); | ||
330 | } | ||
320 | return 0; | 331 | return 0; |
321 | } | 332 | } |
322 | core_initcall(reboot_init); | 333 | core_initcall(reboot_init); |
@@ -465,7 +476,12 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { | |||
465 | 476 | ||
466 | static int __init pci_reboot_init(void) | 477 | static int __init pci_reboot_init(void) |
467 | { | 478 | { |
468 | dmi_check_system(pci_reboot_dmi_table); | 479 | /* Only do the DMI check if reboot_type hasn't been overridden |
480 | * on the command line | ||
481 | */ | ||
482 | if (reboot_default) { | ||
483 | dmi_check_system(pci_reboot_dmi_table); | ||
484 | } | ||
469 | return 0; | 485 | return 0; |
470 | } | 486 | } |
471 | core_initcall(pci_reboot_init); | 487 | core_initcall(pci_reboot_init); |
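The reboot.c change boils down to one pattern: record at parse time whether the user passed reboot=, and only consult the DMI quirk tables when they did not. A minimal sketch of that pattern using the names from the patch (parsing details elided):

static int reboot_default = 1;			/* no reboot= seen yet */

static int __init reboot_setup(char *str)
{
	/* Any reboot= argument disables the DMI quirk scan below. */
	reboot_default = 0;
	/* ... parse 'w', 'c', 'b', 'a', 'k', 'p', ... as before ... */
	return 1;
}
__setup("reboot=", reboot_setup);

static int __init reboot_init(void)
{
	if (reboot_default)			/* user override wins over quirks */
		dmi_check_system(reboot_dmi_table);
	return 0;
}
core_initcall(reboot_init);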
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d7d5099fe874..88638883176a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -749,10 +749,16 @@ void __init setup_arch(char **cmdline_p) | |||
749 | #endif | 749 | #endif |
750 | #ifdef CONFIG_EFI | 750 | #ifdef CONFIG_EFI |
751 | if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, | 751 | if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, |
752 | EFI_LOADER_SIGNATURE, 4)) { | 752 | "EL32", 4)) { |
753 | efi_enabled = 1; | 753 | efi_enabled = 1; |
754 | efi_memblock_x86_reserve_range(); | 754 | efi_64bit = false; |
755 | } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, | ||
756 | "EL64", 4)) { | ||
757 | efi_enabled = 1; | ||
758 | efi_64bit = true; | ||
755 | } | 759 | } |
760 | if (efi_enabled && efi_memblock_x86_reserve_range()) | ||
761 | efi_enabled = 0; | ||
756 | #endif | 762 | #endif |
757 | 763 | ||
758 | x86_init.oem.arch_setup(); | 764 | x86_init.oem.arch_setup(); |
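The setup.c hunk replaces the single EFI_LOADER_SIGNATURE test with a check that also records whether the firmware is 32-bit ("EL32") or 64-bit ("EL64"), and lets a failed memblock reservation veto EFI entirely. Condensed, the resulting logic reads (illustrative rewrite of the hunk, not additional code):

#ifdef CONFIG_EFI
	const char *sig = (char *)&boot_params.efi_info.efi_loader_signature;

	if (!strncmp(sig, "EL32", 4)) {
		efi_enabled = 1;
		efi_64bit = false;	/* 32-bit EFI runtime services */
	} else if (!strncmp(sig, "EL64", 4)) {
		efi_enabled = 1;
		efi_64bit = true;	/* 64-bit EFI runtime services */
	}
	/* If the EFI memory map can't be reserved, run without EFI. */
	if (efi_enabled && efi_memblock_x86_reserve_range())
		efi_enabled = 0;
#endif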
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 46a01bdc27e2..25edcfc9ba5b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <asm/processor.h> | 24 | #include <asm/processor.h> |
25 | #include <asm/ucontext.h> | 25 | #include <asm/ucontext.h> |
26 | #include <asm/i387.h> | 26 | #include <asm/i387.h> |
27 | #include <asm/fpu-internal.h> | ||
27 | #include <asm/vdso.h> | 28 | #include <asm/vdso.h> |
28 | #include <asm/mce.h> | 29 | #include <asm/mce.h> |
29 | 30 | ||
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..5104a2b685cf 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
255 | * most necessary things. | 255 | * most necessary things. |
256 | */ | 256 | */ |
257 | cpu_init(); | 257 | cpu_init(); |
258 | x86_cpuinit.early_percpu_clock_init(); | ||
258 | preempt_disable(); | 259 | preempt_disable(); |
259 | smp_callin(); | 260 | smp_callin(); |
260 | 261 | ||
@@ -291,19 +292,6 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
291 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; | 292 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; |
292 | x86_platform.nmi_init(); | 293 | x86_platform.nmi_init(); |
293 | 294 | ||
294 | /* | ||
295 | * Wait until the cpu which brought this one up marked it | ||
296 | * online before enabling interrupts. If we don't do that then | ||
297 | * we can end up waking up the softirq thread before this cpu | ||
298 | * reached the active state, which makes the scheduler unhappy | ||
299 | * and schedule the softirq thread on the wrong cpu. This is | ||
300 | * only observable with forced threaded interrupts, but in | ||
301 | * theory it could also happen w/o them. It's just way harder | ||
302 | * to achieve. | ||
303 | */ | ||
304 | while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) | ||
305 | cpu_relax(); | ||
306 | |||
307 | /* enable local interrupts */ | 295 | /* enable local interrupts */ |
308 | local_irq_enable(); | 296 | local_irq_enable(); |
309 | 297 | ||
@@ -740,8 +728,6 @@ do_rest: | |||
740 | * the targeted processor. | 728 | * the targeted processor. |
741 | */ | 729 | */ |
742 | 730 | ||
743 | printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip); | ||
744 | |||
745 | atomic_set(&init_deasserted, 0); | 731 | atomic_set(&init_deasserted, 0); |
746 | 732 | ||
747 | if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { | 733 | if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { |
@@ -791,9 +777,10 @@ do_rest: | |||
791 | schedule(); | 777 | schedule(); |
792 | } | 778 | } |
793 | 779 | ||
794 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) | 780 | if (cpumask_test_cpu(cpu, cpu_callin_mask)) { |
781 | print_cpu_msr(&cpu_data(cpu)); | ||
795 | pr_debug("CPU%d: has booted.\n", cpu); | 782 | pr_debug("CPU%d: has booted.\n", cpu); |
796 | else { | 783 | } else { |
797 | boot_error = 1; | 784 | boot_error = 1; |
798 | if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) | 785 | if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) |
799 | == 0xA5A5A5A5) | 786 | == 0xA5A5A5A5) |
@@ -847,7 +834,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) | |||
847 | 834 | ||
848 | if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || | 835 | if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || |
849 | !physid_isset(apicid, phys_cpu_present_map) || | 836 | !physid_isset(apicid, phys_cpu_present_map) || |
850 | (!x2apic_mode && apicid >= 255)) { | 837 | !apic->apic_id_valid(apicid)) { |
851 | printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); | 838 | printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); |
852 | return -EINVAL; | 839 | return -EINVAL; |
853 | } | 840 | } |
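The smpboot.c hunk replaces the open-coded "!x2apic_mode && apicid >= 255" test with the per-driver ->apic_id_valid() callback added to the apic structs earlier in this series. The two common implementations are roughly as follows (assumed, based on the semantics of the replaced test):

static inline int default_apic_id_valid(int apicid)
{
	return apicid < 255;		/* xAPIC IDs are 8 bits; 0xff is broadcast */
}

static int x2apic_apic_id_valid(int apicid)
{
	return 1;			/* x2APIC IDs use the full 32 bits */
}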
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 051489082d59..ef59642ff1bf 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c | |||
@@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
195 | { | 195 | { |
196 | struct vm_area_struct *vma; | 196 | struct vm_area_struct *vma; |
197 | struct mm_struct *mm = current->mm; | 197 | struct mm_struct *mm = current->mm; |
198 | unsigned long addr = addr0; | 198 | unsigned long addr = addr0, start_addr; |
199 | 199 | ||
200 | /* requested length too big for entire address space */ | 200 | /* requested length too big for entire address space */ |
201 | if (len > TASK_SIZE) | 201 | if (len > TASK_SIZE) |
@@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
223 | mm->free_area_cache = mm->mmap_base; | 223 | mm->free_area_cache = mm->mmap_base; |
224 | } | 224 | } |
225 | 225 | ||
226 | try_again: | ||
226 | /* either no address requested or can't fit in requested address hole */ | 227 | /* either no address requested or can't fit in requested address hole */ |
227 | addr = mm->free_area_cache; | 228 | start_addr = addr = mm->free_area_cache; |
228 | |||
229 | /* make sure it can fit in the remaining address space */ | ||
230 | if (addr > len) { | ||
231 | unsigned long tmp_addr = align_addr(addr - len, filp, | ||
232 | ALIGN_TOPDOWN); | ||
233 | |||
234 | vma = find_vma(mm, tmp_addr); | ||
235 | if (!vma || tmp_addr + len <= vma->vm_start) | ||
236 | /* remember the address as a hint for next time */ | ||
237 | return mm->free_area_cache = tmp_addr; | ||
238 | } | ||
239 | |||
240 | if (mm->mmap_base < len) | ||
241 | goto bottomup; | ||
242 | 229 | ||
243 | addr = mm->mmap_base-len; | 230 | if (addr < len) |
231 | goto fail; | ||
244 | 232 | ||
233 | addr -= len; | ||
245 | do { | 234 | do { |
246 | addr = align_addr(addr, filp, ALIGN_TOPDOWN); | 235 | addr = align_addr(addr, filp, ALIGN_TOPDOWN); |
247 | 236 | ||
@@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
263 | addr = vma->vm_start-len; | 252 | addr = vma->vm_start-len; |
264 | } while (len < vma->vm_start); | 253 | } while (len < vma->vm_start); |
265 | 254 | ||
255 | fail: | ||
256 | /* | ||
257 | * if hint left us with no space for the requested | ||
258 | * mapping then try again: | ||
259 | */ | ||
260 | if (start_addr != mm->mmap_base) { | ||
261 | mm->free_area_cache = mm->mmap_base; | ||
262 | mm->cached_hole_size = 0; | ||
263 | goto try_again; | ||
264 | } | ||
265 | |||
266 | bottomup: | 266 | bottomup: |
267 | /* | 267 | /* |
268 | * A failed mmap() very likely causes application failure, | 268 | * A failed mmap() very likely causes application failure, |
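The sys_x86_64.c rework replaces the special-cased "does the hint fit" probe with a single retry: remember where the search started, and if it fails while the cached hint was below mm->mmap_base, reset the cache and search once more from the top before falling back to the bottom-up allocator. Condensed control flow (illustrative; the unchanged middle of the loop is elided):

try_again:
	start_addr = addr = mm->free_area_cache;
	if (addr < len)
		goto fail;				/* hint leaves no room at all */

	addr -= len;
	do {
		addr = align_addr(addr, filp, ALIGN_TOPDOWN);
		vma = find_vma(mm, addr);
		if (!vma || addr + len <= vma->vm_start)
			return mm->free_area_cache = addr;	/* found a hole */
		/* ... track cached_hole_size, step below this vma ... */
		addr = vma->vm_start - len;
	} while (len < vma->vm_start);

fail:
	if (start_addr != mm->mmap_base) {
		/* The stale hint was the problem: restart from the top once. */
		mm->free_area_cache = mm->mmap_base;
		mm->cached_hole_size = 0;
		goto try_again;
	}
	/* otherwise fall through to the bottom-up path */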
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index dd5fbf4101fc..c6eba2b42673 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c | |||
@@ -57,9 +57,6 @@ EXPORT_SYMBOL(profile_pc); | |||
57 | */ | 57 | */ |
58 | static irqreturn_t timer_interrupt(int irq, void *dev_id) | 58 | static irqreturn_t timer_interrupt(int irq, void *dev_id) |
59 | { | 59 | { |
60 | /* Keep nmi watchdog up to date */ | ||
61 | inc_irq_stat(irq0_irqs); | ||
62 | |||
63 | global_clock_event->event_handler(global_clock_event); | 60 | global_clock_event->event_handler(global_clock_event); |
64 | 61 | ||
65 | /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ | 62 | /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 482ec3af2067..ec61d4c1b93b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -54,6 +54,7 @@ | |||
54 | #include <asm/traps.h> | 54 | #include <asm/traps.h> |
55 | #include <asm/desc.h> | 55 | #include <asm/desc.h> |
56 | #include <asm/i387.h> | 56 | #include <asm/i387.h> |
57 | #include <asm/fpu-internal.h> | ||
57 | #include <asm/mce.h> | 58 | #include <asm/mce.h> |
58 | 59 | ||
59 | #include <asm/mach_traps.h> | 60 | #include <asm/mach_traps.h> |
@@ -571,41 +572,18 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) | |||
571 | } | 572 | } |
572 | 573 | ||
573 | /* | 574 | /* |
574 | * __math_state_restore assumes that cr0.TS is already clear and the | ||
575 | * fpu state is all ready for use. Used during context switch. | ||
576 | */ | ||
577 | void __math_state_restore(void) | ||
578 | { | ||
579 | struct thread_info *thread = current_thread_info(); | ||
580 | struct task_struct *tsk = thread->task; | ||
581 | |||
582 | /* | ||
583 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. | ||
584 | */ | ||
585 | if (unlikely(restore_fpu_checking(tsk))) { | ||
586 | stts(); | ||
587 | force_sig(SIGSEGV, tsk); | ||
588 | return; | ||
589 | } | ||
590 | |||
591 | thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ | ||
592 | tsk->fpu_counter++; | ||
593 | } | ||
594 | |||
595 | /* | ||
596 | * 'math_state_restore()' saves the current math information in the | 575 | * 'math_state_restore()' saves the current math information in the |
597 | * old math state array, and gets the new ones from the current task | 576 | * old math state array, and gets the new ones from the current task |
598 | * | 577 | * |
599 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. | 578 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. |
600 | * Don't touch unless you *really* know how it works. | 579 | * Don't touch unless you *really* know how it works. |
601 | * | 580 | * |
602 | * Must be called with kernel preemption disabled (in this case, | 581 | * Must be called with kernel preemption disabled (eg with local |
603 | * local interrupts are disabled at the call-site in entry.S). | 582 | * interrupts disabled, as in the case of do_device_not_available). |
604 | */ | 583 | */ |
605 | asmlinkage void math_state_restore(void) | 584 | void math_state_restore(void) |
606 | { | 585 | { |
607 | struct thread_info *thread = current_thread_info(); | 586 | struct task_struct *tsk = current; |
608 | struct task_struct *tsk = thread->task; | ||
609 | 587 | ||
610 | if (!tsk_used_math(tsk)) { | 588 | if (!tsk_used_math(tsk)) { |
611 | local_irq_enable(); | 589 | local_irq_enable(); |
@@ -622,9 +600,17 @@ asmlinkage void math_state_restore(void) | |||
622 | local_irq_disable(); | 600 | local_irq_disable(); |
623 | } | 601 | } |
624 | 602 | ||
625 | clts(); /* Allow maths ops (or we recurse) */ | 603 | __thread_fpu_begin(tsk); |
604 | /* | ||
605 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. | ||
606 | */ | ||
607 | if (unlikely(restore_fpu_checking(tsk))) { | ||
608 | __thread_fpu_end(tsk); | ||
609 | force_sig(SIGSEGV, tsk); | ||
610 | return; | ||
611 | } | ||
626 | 612 | ||
627 | __math_state_restore(); | 613 | tsk->fpu_counter++; |
628 | } | 614 | } |
629 | EXPORT_SYMBOL_GPL(math_state_restore); | 615 | EXPORT_SYMBOL_GPL(math_state_restore); |
630 | 616 | ||
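math_state_restore() now uses the __thread_fpu_begin()/__thread_fpu_end() pair instead of touching CR0.TS and TS_USEDFPU by hand. Conceptually the helpers look like this (sketch of the assumed <asm/fpu-internal.h> implementation; __thread_set_has_fpu()/__thread_clear_has_fpu() are not shown in this diff):

static inline void __thread_fpu_begin(struct task_struct *tsk)
{
	clts();				/* let FPU instructions run without #NM */
	__thread_set_has_fpu(tsk);	/* record tsk as the FPU owner */
}

static inline void __thread_fpu_end(struct task_struct *tsk)
{
	__thread_clear_has_fpu(tsk);	/* give up ownership */
	stts();				/* re-arm the device-not-available trap */
}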
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97ec..899a03f2d181 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | |||
620 | 620 | ||
621 | if (cpu_khz) { | 621 | if (cpu_khz) { |
622 | *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; | 622 | *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; |
623 | *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); | 623 | *offset = ns_now - mult_frac(tsc_now, *scale, |
624 | (1UL << CYC2NS_SCALE_FACTOR)); | ||
624 | } | 625 | } |
625 | 626 | ||
626 | sched_clock_idle_wakeup_event(0); | 627 | sched_clock_idle_wakeup_event(0); |
@@ -629,7 +630,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | |||
629 | 630 | ||
630 | static unsigned long long cyc2ns_suspend; | 631 | static unsigned long long cyc2ns_suspend; |
631 | 632 | ||
632 | void save_sched_clock_state(void) | 633 | void tsc_save_sched_clock_state(void) |
633 | { | 634 | { |
634 | if (!sched_clock_stable) | 635 | if (!sched_clock_stable) |
635 | return; | 636 | return; |
@@ -645,7 +646,7 @@ void save_sched_clock_state(void) | |||
645 | * that sched_clock() continues from the point where it was left off during | 646 | * that sched_clock() continues from the point where it was left off during |
646 | * suspend. | 647 | * suspend. |
647 | */ | 648 | */ |
648 | void restore_sched_clock_state(void) | 649 | void tsc_restore_sched_clock_state(void) |
649 | { | 650 | { |
650 | unsigned long long offset; | 651 | unsigned long long offset; |
651 | unsigned long flags; | 652 | unsigned long flags; |
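The cyc2ns offset change guards against 64-bit overflow: tsc_now * *scale can exceed 2^64 for large TSC values, while mult_frac(x, n, d) computes the same x*n/d by splitting x into quotient and remainder of d. Its definition in <linux/kernel.h> is roughly:

#define mult_frac(x, numer, denom)(			\
{							\
	typeof(x) quot = (x) / (denom);			\
	typeof(x) rem  = (x) % (denom);			\
	(quot * (numer)) + ((rem * (numer)) / (denom));	\
}							\
)

With x = tsc_now, numer = *scale and denom = 1 << CYC2NS_SCALE_FACTOR, the quotient is already divided by 2^CYC2NS_SCALE_FACTOR before being multiplied, so the intermediate product is smaller by that factor than the old tsc_now * *scale.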
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9eba29b46cb7..fc25e60a5884 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c | |||
@@ -42,7 +42,7 @@ static __cpuinitdata int nr_warps; | |||
42 | /* | 42 | /* |
43 | * TSC-warp measurement loop running on both CPUs: | 43 | * TSC-warp measurement loop running on both CPUs: |
44 | */ | 44 | */ |
45 | static __cpuinit void check_tsc_warp(void) | 45 | static __cpuinit void check_tsc_warp(unsigned int timeout) |
46 | { | 46 | { |
47 | cycles_t start, now, prev, end; | 47 | cycles_t start, now, prev, end; |
48 | int i; | 48 | int i; |
@@ -51,9 +51,9 @@ static __cpuinit void check_tsc_warp(void) | |||
51 | start = get_cycles(); | 51 | start = get_cycles(); |
52 | rdtsc_barrier(); | 52 | rdtsc_barrier(); |
53 | /* | 53 | /* |
54 | * The measurement runs for 20 msecs: | 54 | * The measurement runs for 'timeout' msecs: |
55 | */ | 55 | */ |
56 | end = start + tsc_khz * 20ULL; | 56 | end = start + (cycles_t) tsc_khz * timeout; |
57 | now = start; | 57 | now = start; |
58 | 58 | ||
59 | for (i = 0; ; i++) { | 59 | for (i = 0; ; i++) { |
@@ -99,6 +99,25 @@ static __cpuinit void check_tsc_warp(void) | |||
99 | } | 99 | } |
100 | 100 | ||
101 | /* | 101 | /* |
102 | * If the target CPU coming online doesn't have any of its core-siblings | ||
103 | * online, a timeout of 20msec will be used for the TSC-warp measurement | ||
104 | * loop. Otherwise a smaller timeout of 2msec will be used, as we have some | ||
105 | * information about this socket already (and this information grows as we | ||
106 | * have more and more logical-siblings in that socket). | ||
107 | * | ||
108 | * Ideally we should be able to skip the TSC sync check on the other | ||
109 | * core-siblings, if the first logical CPU in a socket passed the sync test. | ||
110 | * But as the TSC is per-logical CPU and can potentially be modified wrongly | ||
111 | * by the BIOS, a TSC sync test of smaller duration should be able | ||
112 | * to catch such errors. Also this will catch the condition where all the | ||
113 | * cores in the socket don't get reset at the same time. | ||
114 | */ | ||
115 | static inline unsigned int loop_timeout(int cpu) | ||
116 | { | ||
117 | return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20; | ||
118 | } | ||
119 | |||
120 | /* | ||
102 | * Source CPU calls into this - it waits for the freshly booted | 121 | * Source CPU calls into this - it waits for the freshly booted |
103 | * target CPU to arrive and then starts the measurement: | 122 | * target CPU to arrive and then starts the measurement: |
104 | */ | 123 | */ |
@@ -135,7 +154,7 @@ void __cpuinit check_tsc_sync_source(int cpu) | |||
135 | */ | 154 | */ |
136 | atomic_inc(&start_count); | 155 | atomic_inc(&start_count); |
137 | 156 | ||
138 | check_tsc_warp(); | 157 | check_tsc_warp(loop_timeout(cpu)); |
139 | 158 | ||
140 | while (atomic_read(&stop_count) != cpus-1) | 159 | while (atomic_read(&stop_count) != cpus-1) |
141 | cpu_relax(); | 160 | cpu_relax(); |
@@ -183,7 +202,7 @@ void __cpuinit check_tsc_sync_target(void) | |||
183 | while (atomic_read(&start_count) != cpus) | 202 | while (atomic_read(&start_count) != cpus) |
184 | cpu_relax(); | 203 | cpu_relax(); |
185 | 204 | ||
186 | check_tsc_warp(); | 205 | check_tsc_warp(loop_timeout(smp_processor_id())); |
187 | 206 | ||
188 | /* | 207 | /* |
189 | * Ok, we are done: | 208 | * Ok, we are done: |
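For scale: tsc_khz is in cycles per millisecond, so end = start + (cycles_t)tsc_khz * timeout bounds the warp loop to roughly 'timeout' ms of wall time. On a hypothetical 2.4 GHz part (tsc_khz ≈ 2,400,000), the first CPU of a socket spins for about 48 million cycles (20 ms), while each later sibling spins for only about 4.8 million cycles (2 ms).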
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index b466cab5ba15..328cb37bb827 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) | |||
172 | spinlock_t *ptl; | 172 | spinlock_t *ptl; |
173 | int i; | 173 | int i; |
174 | 174 | ||
175 | down_write(&mm->mmap_sem); | ||
175 | pgd = pgd_offset(mm, 0xA0000); | 176 | pgd = pgd_offset(mm, 0xA0000); |
176 | if (pgd_none_or_clear_bad(pgd)) | 177 | if (pgd_none_or_clear_bad(pgd)) |
177 | goto out; | 178 | goto out; |
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) | |||
190 | } | 191 | } |
191 | pte_unmap_unlock(pte, ptl); | 192 | pte_unmap_unlock(pte, ptl); |
192 | out: | 193 | out: |
194 | up_write(&mm->mmap_sem); | ||
193 | flush_tlb(); | 195 | flush_tlb(); |
194 | } | 196 | } |
195 | 197 | ||
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 947a06ccc673..e9f265fd79ae 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c | |||
@@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = { | |||
91 | }; | 91 | }; |
92 | 92 | ||
93 | struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { | 93 | struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { |
94 | .early_percpu_clock_init = x86_init_noop, | ||
94 | .setup_percpu_clockev = setup_secondary_APIC_clock, | 95 | .setup_percpu_clockev = setup_secondary_APIC_clock, |
95 | .fixup_cpu_id = x86_default_fixup_cpu_id, | 96 | .fixup_cpu_id = x86_default_fixup_cpu_id, |
96 | }; | 97 | }; |
@@ -107,7 +108,9 @@ struct x86_platform_ops x86_platform = { | |||
107 | .is_untracked_pat_range = is_ISA_range, | 108 | .is_untracked_pat_range = is_ISA_range, |
108 | .nmi_init = default_nmi_init, | 109 | .nmi_init = default_nmi_init, |
109 | .get_nmi_reason = default_get_nmi_reason, | 110 | .get_nmi_reason = default_get_nmi_reason, |
110 | .i8042_detect = default_i8042_detect | 111 | .i8042_detect = default_i8042_detect, |
112 | .save_sched_clock_state = tsc_save_sched_clock_state, | ||
113 | .restore_sched_clock_state = tsc_restore_sched_clock_state, | ||
111 | }; | 114 | }; |
112 | 115 | ||
113 | EXPORT_SYMBOL_GPL(x86_platform); | 116 | EXPORT_SYMBOL_GPL(x86_platform); |
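The two clock-state callbacks default to the freshly renamed TSC functions, and early_percpu_clock_init defaults to a no-op so a paravirt guest can hook it. Based only on the initializers above, the hook declarations presumably look like this (assumed sketch of <asm/x86_init.h>, not verbatim):

struct x86_cpuinit_ops {
	void (*early_percpu_clock_init)(void);	/* runs right after cpu_init() on APs */
	void (*setup_percpu_clockev)(void);
	void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
};

struct x86_platform_ops {
	/* ... */
	void (*save_sched_clock_state)(void);	 /* suspend path */
	void (*restore_sched_clock_state)(void); /* resume path */
	/* ... */
};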
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index a3911343976b..e62728e30b01 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/bootmem.h> | 6 | #include <linux/bootmem.h> |
7 | #include <linux/compat.h> | 7 | #include <linux/compat.h> |
8 | #include <asm/i387.h> | 8 | #include <asm/i387.h> |
9 | #include <asm/fpu-internal.h> | ||
9 | #ifdef CONFIG_IA32_EMULATION | 10 | #ifdef CONFIG_IA32_EMULATION |
10 | #include <asm/sigcontext32.h> | 11 | #include <asm/sigcontext32.h> |
11 | #endif | 12 | #endif |
@@ -47,7 +48,7 @@ void __sanitize_i387_state(struct task_struct *tsk) | |||
47 | if (!fx) | 48 | if (!fx) |
48 | return; | 49 | return; |
49 | 50 | ||
50 | BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); | 51 | BUG_ON(__thread_has_fpu(tsk)); |
51 | 52 | ||
52 | xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; | 53 | xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; |
53 | 54 | ||
@@ -168,7 +169,7 @@ int save_i387_xstate(void __user *buf) | |||
168 | if (!used_math()) | 169 | if (!used_math()) |
169 | return 0; | 170 | return 0; |
170 | 171 | ||
171 | if (task_thread_info(tsk)->status & TS_USEDFPU) { | 172 | if (user_has_fpu()) { |
172 | if (use_xsave()) | 173 | if (use_xsave()) |
173 | err = xsave_user(buf); | 174 | err = xsave_user(buf); |
174 | else | 175 | else |
@@ -176,8 +177,7 @@ int save_i387_xstate(void __user *buf) | |||
176 | 177 | ||
177 | if (err) | 178 | if (err) |
178 | return err; | 179 | return err; |
179 | task_thread_info(tsk)->status &= ~TS_USEDFPU; | 180 | user_fpu_end(); |
180 | stts(); | ||
181 | } else { | 181 | } else { |
182 | sanitize_i387_state(tsk); | 182 | sanitize_i387_state(tsk); |
183 | if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, | 183 | if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, |
@@ -292,10 +292,7 @@ int restore_i387_xstate(void __user *buf) | |||
292 | return err; | 292 | return err; |
293 | } | 293 | } |
294 | 294 | ||
295 | if (!(task_thread_info(current)->status & TS_USEDFPU)) { | 295 | user_fpu_begin(); |
296 | clts(); | ||
297 | task_thread_info(current)->status |= TS_USEDFPU; | ||
298 | } | ||
299 | if (use_xsave()) | 296 | if (use_xsave()) |
300 | err = restore_user_xstate(buf); | 297 | err = restore_user_xstate(buf); |
301 | else | 298 | else |
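The xsave.c conversions rely on user_has_fpu()/user_fpu_begin()/user_fpu_end() replacing the manual TS_USEDFPU and CR0.TS handling. A plausible sketch of those helpers (assumed; the real ones live in <asm/fpu-internal.h>):

static inline void user_fpu_begin(void)
{
	preempt_disable();
	if (!user_has_fpu())
		__thread_fpu_begin(current);	/* clts(), current owns the FPU */
	preempt_enable();
}

static inline void user_fpu_end(void)
{
	preempt_disable();
	__thread_fpu_end(current);		/* drop ownership, set CR0.TS */
	preempt_enable();
}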