author    Linus Torvalds <torvalds@linux-foundation.org>  2009-12-05 18:30:21 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2009-12-05 18:30:21 -0500
commit    c3fa27d1367fac63ac8533d6f20ea851d0d70a10 (patch)
tree      e7731554085e22b6b63411b1ebb401079f3e0bbb /arch/x86/kernel
parent    96fa2b508d2d3fe040cf4ef2fffb955f0a537ea1 (diff)
parent    d103d01e4b19f185d3c85f77402b605534c32e89 (diff)
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (470 commits)
  x86: Fix comments of register/stack access functions
  perf tools: Replace %m with %a in sscanf
  hw-breakpoints: Keep track of user disabled breakpoints
  tracing/syscalls: Make syscall events print callbacks static
  tracing: Add DEFINE_EVENT(), DEFINE_SINGLE_EVENT() support to docbook
  perf: Don't free perf_mmap_data until work has been done
  perf_event: Fix compile error
  perf tools: Fix _GNU_SOURCE macro related strndup() build error
  trace_syscalls: Remove unused syscall_name_to_nr()
  trace_syscalls: Simplify syscall profile
  trace_syscalls: Remove duplicate init_enter_##sname()
  trace_syscalls: Add syscall_nr field to struct syscall_metadata
  trace_syscalls: Remove enter_id exit_id
  trace_syscalls: Set event_enter_##sname->data to its metadata
  trace_syscalls: Remove unused event_syscall_enter and event_syscall_exit
  perf_event: Initialize data.period in perf_swevent_hrtimer()
  perf probe: Simplify event naming
  perf probe: Add --list option for listing current probe events
  perf probe: Add argv_split() from lib/argv_split.c
  perf probe: Move probe event utility functions to probe-event.c
  ...
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile                  |    2
-rw-r--r--  arch/x86/kernel/cpu/Makefile              |    1
-rw-r--r--  arch/x86/kernel/cpu/common.c              |    4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c          |  103
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c  |   29
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c          |  205
-rw-r--r--  arch/x86/kernel/entry_32.S                |   24
-rw-r--r--  arch/x86/kernel/entry_64.S                |    8
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c           |  555
-rw-r--r--  arch/x86/kernel/irq.c                     |   12
-rw-r--r--  arch/x86/kernel/kgdb.c                    |    6
-rw-r--r--  arch/x86/kernel/kprobes.c                 |  243
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c        |    2
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c        |    2
-rw-r--r--  arch/x86/kernel/process.c                 |   21
-rw-r--r--  arch/x86/kernel/process_32.c              |    6
-rw-r--r--  arch/x86/kernel/process_64.c              |    7
-rw-r--r--  arch/x86/kernel/ptrace.c                  |  415
-rw-r--r--  arch/x86/kernel/setup.c                   |    3
-rw-r--r--  arch/x86/kernel/signal.c                  |    9
-rw-r--r--  arch/x86/kernel/traps.c                   |   73
21 files changed, 1365 insertions(+), 365 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d8e5d0cdd678..4f2e66e29ecc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y			+= alternative.o i8253.o pci-nommu.o
+obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..1d2cb383410e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
 # Don't trace early stages of a secondary CPU boot
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_common.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 # Make sure load_percpu_segment has no stackprotector
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc25c2b4a567..9053be5d95cd 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -837,10 +837,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 		boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
 	}
 
-#ifdef CONFIG_X86_MCE
 	/* Init Machine Check Exception if available. */
-	mcheck_init(c);
-#endif
+	mcheck_cpu_init(c);
 
 	select_idle_routine(c);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 721a77ca8115..0bcaa3875863 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -46,6 +46,9 @@
 
 #include "mce-internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
 int mce_disabled __read_mostly;
 
 #define MISC_MCELOG_MINOR	227
@@ -85,18 +88,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int cpu_missing;
 
-static void default_decode_mce(struct mce *m)
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
+
+static int default_decode_mce(struct notifier_block *nb, unsigned long val,
+			       void *data)
 {
 	pr_emerg("No human readable MCE decoding support on this CPU type.\n");
 	pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+
+	return NOTIFY_STOP;
 }
 
-/*
- * CPU/chipset specific EDAC code can register a callback here to print
- * MCE errors in a human-readable form:
- */
-void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
-EXPORT_SYMBOL(x86_mce_decode_callback);
+static struct notifier_block mce_dec_nb = {
+	.notifier_call = default_decode_mce,
+	.priority      = -1,
+};
 
 /* MCA banks polled by the period polling timer for corrected events */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
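
The decoder chain above replaces the single x86_mce_decode_callback function pointer: multiple decoders can now coexist, ordered by priority, with the pr_emerg() fallback registered at priority -1. A minimal sketch of how a CPU/chipset-specific (EDAC-style) decoder might hook in — the callback name and decode logic here are hypothetical; the chain and notifier API are as introduced by this patch:

	static int my_decode_mce(struct notifier_block *nb, unsigned long val,
				 void *data)
	{
		struct mce *m = data;

		/* print a human-readable report instead of the fallback hint */
		pr_emerg("decoded MCE: bank %d, status 0x%llx\n",
			 m->bank, m->status);

		/*
		 * NOTIFY_STOP ends the chain walk, so the priority -1
		 * default ("run mcelog --ascii") never runs.
		 */
		return NOTIFY_STOP;
	}

	static struct notifier_block my_mce_dec_nb = {
		.notifier_call	= my_decode_mce,
		/* default priority 0 runs before the fallback at -1 */
	};

	/* in module init:
	 * atomic_notifier_chain_register(&x86_mce_decoder_chain, &my_mce_dec_nb);
	 */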
@@ -141,6 +152,9 @@ void mce_log(struct mce *mce)
 {
 	unsigned next, entry;
 
+	/* Emit the trace record: */
+	trace_mce_record(mce);
+
 	mce->finished = 0;
 	wmb();
 	for (;;) {
@@ -204,9 +218,9 @@ static void print_mce(struct mce *m)
 
 	/*
 	 * Print out human-readable details about the MCE error,
-	 * (if the CPU has an implementation for that):
+	 * (if the CPU has an implementation for that)
 	 */
-	x86_mce_decode_callback(m);
+	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 }
 
 static void print_mce_head(void)
@@ -1122,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
 static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mcheck_timer(unsigned long data)
+static void mce_start_timer(unsigned long data)
 {
 	struct timer_list *t = &per_cpu(mce_timer, data);
 	int *n;
@@ -1187,7 +1201,7 @@ int mce_notify_irq(void)
 }
 EXPORT_SYMBOL_GPL(mce_notify_irq);
 
-static int mce_banks_init(void)
+static int __cpuinit __mcheck_cpu_mce_banks_init(void)
 {
 	int i;
 
@@ -1206,7 +1220,7 @@ static int mce_banks_init(void)
 /*
  * Initialize Machine Checks for a CPU.
  */
-static int __cpuinit mce_cap_init(void)
+static int __cpuinit __mcheck_cpu_cap_init(void)
 {
 	unsigned b;
 	u64 cap;
@@ -1228,7 +1242,7 @@ static int __cpuinit mce_cap_init(void)
 	WARN_ON(banks != 0 && b != banks);
 	banks = b;
 	if (!mce_banks) {
-		int err = mce_banks_init();
+		int err = __mcheck_cpu_mce_banks_init();
 
 		if (err)
 			return err;
@@ -1244,7 +1258,7 @@ static int __cpuinit mce_cap_init(void)
 	return 0;
 }
 
-static void mce_init(void)
+static void __mcheck_cpu_init_generic(void)
 {
 	mce_banks_t all_banks;
 	u64 cap;
@@ -1273,7 +1287,7 @@ static void mce_init(void)
 }
 
 /* Add per CPU specific workarounds here */
-static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 {
 	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
 		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1341,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 	return 0;
 }
 
-static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
+static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
 {
 	if (c->x86 != 5)
 		return;
@@ -1355,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
 	}
 }
 
-static void mce_cpu_features(struct cpuinfo_x86 *c)
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 {
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
@@ -1369,7 +1383,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
 	}
 }
 
-static void mce_init_timer(void)
+static void __mcheck_cpu_init_timer(void)
 {
 	struct timer_list *t = &__get_cpu_var(mce_timer);
 	int *n = &__get_cpu_var(mce_next_interval);
@@ -1380,7 +1394,7 @@ static void mce_init_timer(void)
 	*n = check_interval * HZ;
 	if (!*n)
 		return;
-	setup_timer(t, mcheck_timer, smp_processor_id());
+	setup_timer(t, mce_start_timer, smp_processor_id());
 	t->expires = round_jiffies(jiffies + *n);
 	add_timer_on(t, smp_processor_id());
 }
@@ -1400,27 +1414,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off:
  */
-void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
+void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
 {
 	if (mce_disabled)
 		return;
 
-	mce_ancient_init(c);
+	__mcheck_cpu_ancient_init(c);
 
 	if (!mce_available(c))
 		return;
 
-	if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
+	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
 		mce_disabled = 1;
 		return;
 	}
 
 	machine_check_vector = do_machine_check;
 
-	mce_init();
-	mce_cpu_features(c);
-	mce_init_timer();
+	__mcheck_cpu_init_generic();
+	__mcheck_cpu_init_vendor(c);
+	__mcheck_cpu_init_timer();
 	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
+
 }
 
 /*
@@ -1640,6 +1655,15 @@ static int __init mcheck_enable(char *str)
 }
 __setup("mce", mcheck_enable);
 
+int __init mcheck_init(void)
+{
+	atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
+
+	mcheck_intel_therm_init();
+
+	return 0;
+}
+
 /*
  * Sysfs support
  */
@@ -1648,7 +1672,7 @@ __setup("mce", mcheck_enable);
  * Disable machine checks on suspend and shutdown. We can't really handle
  * them later.
  */
-static int mce_disable(void)
+static int mce_disable_error_reporting(void)
 {
 	int i;
 
@@ -1663,12 +1687,12 @@ static int mce_disable(void)
 
 static int mce_suspend(struct sys_device *dev, pm_message_t state)
 {
-	return mce_disable();
+	return mce_disable_error_reporting();
 }
 
 static int mce_shutdown(struct sys_device *dev)
 {
-	return mce_disable();
+	return mce_disable_error_reporting();
 }
 
 /*
@@ -1678,8 +1702,8 @@ static int mce_shutdown(struct sys_device *dev)
  */
 static int mce_resume(struct sys_device *dev)
 {
-	mce_init();
-	mce_cpu_features(&current_cpu_data);
+	__mcheck_cpu_init_generic();
+	__mcheck_cpu_init_vendor(&current_cpu_data);
 
 	return 0;
 }
@@ -1689,8 +1713,8 @@ static void mce_cpu_restart(void *data)
 	del_timer_sync(&__get_cpu_var(mce_timer));
 	if (!mce_available(&current_cpu_data))
 		return;
-	mce_init();
-	mce_init_timer();
+	__mcheck_cpu_init_generic();
+	__mcheck_cpu_init_timer();
 }
 
 /* Reinit MCEs after user configuration changes */
@@ -1716,7 +1740,7 @@ static void mce_enable_ce(void *all)
 	cmci_reenable();
 	cmci_recheck();
 	if (all)
-		mce_init_timer();
+		__mcheck_cpu_init_timer();
 }
 
 static struct sysdev_class mce_sysclass = {
@@ -1929,13 +1953,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
+static void __cpuinit mce_disable_cpu(void *h)
 {
 	unsigned long action = *(unsigned long *)h;
 	int i;
 
 	if (!mce_available(&current_cpu_data))
 		return;
+
 	if (!(action & CPU_TASKS_FROZEN))
 		cmci_clear();
 	for (i = 0; i < banks; i++) {
@@ -1946,7 +1971,7 @@ static void mce_disable_cpu(void *h)
 	}
 }
 
-static void mce_reenable_cpu(void *h)
+static void __cpuinit mce_reenable_cpu(void *h)
 {
 	unsigned long action = *(unsigned long *)h;
 	int i;
@@ -2025,7 +2050,7 @@ static __init void mce_init_banks(void)
 	}
 }
 
-static __init int mce_init_device(void)
+static __init int mcheck_init_device(void)
 {
 	int err;
 	int i = 0;
@@ -2053,7 +2078,7 @@ static __init int mce_init_device(void)
 	return err;
 }
 
-device_initcall(mce_init_device);
+device_initcall(mcheck_init_device);
 
 /*
  * Old style boot options parsing. Only for compatibility.
@@ -2101,7 +2126,7 @@ static int fake_panic_set(void *data, u64 val)
 DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
 			fake_panic_set, "%llu\n");
 
-static int __init mce_debugfs_init(void)
+static int __init mcheck_debugfs_init(void)
 {
 	struct dentry *dmce, *ffake_panic;
 
@@ -2115,5 +2140,5 @@ static int __init mce_debugfs_init(void)
 
 	return 0;
 }
-late_initcall(mce_debugfs_init);
+late_initcall(mcheck_debugfs_init);
 #endif
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index b3a1dba75330..4fef985fc221 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
 
 static atomic_t therm_throt_en		= ATOMIC_INIT(0);
 
+static u32 lvtthmr_init __read_mostly;
+
 #ifdef CONFIG_SYSFS
 #define define_therm_throt_sysdev_one_ro(_name)				\
 	static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
 	ack_APIC_irq();
 }
 
+void __init mcheck_intel_therm_init(void)
+{
+	/*
+	 * This function is only called on boot CPU. Save the init thermal
+	 * LVT value on BSP and use that value to restore APs' thermal LVT
+	 * entry BIOS programmed later
+	 */
+	if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) &&
+	    cpu_has(&boot_cpu_data, X86_FEATURE_ACC))
+		lvtthmr_init = apic_read(APIC_LVTTHMR);
+}
+
 void intel_init_thermal(struct cpuinfo_x86 *c)
 {
 	unsigned int cpu = smp_processor_id();
@@ -270,7 +284,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 	 * since it might be delivered via SMI already:
 	 */
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-	h = apic_read(APIC_LVTTHMR);
+
+	/*
+	 * The initial value of thermal LVT entries on all APs always reads
+	 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
+	 * sequence to them and LVT registers are reset to 0s except for
+	 * the mask bits which are set to 1s when APs receive INIT IPI.
+	 * Always restore the value that BIOS has programmed on AP based on
+	 * BSP's info we saved since BIOS is always setting the same value
+	 * for all threads/cores
+	 */
+	apic_write(APIC_LVTTHMR, lvtthmr_init);
+
+	h = lvtthmr_init;
+
 	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
 		printk(KERN_DEBUG
 		       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5801c311846..c1bbed1021d9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -77,6 +77,18 @@ struct cpu_hw_events {
 	struct debug_store	*ds;
 };
 
+struct event_constraint {
+	unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	int		code;
+};
+
+#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
+#define EVENT_CONSTRAINT_END  { .code = 0, .idxmsk[0] = 0 }
+
+#define for_each_event_constraint(e, c) \
+	for ((e) = (c); (e)->idxmsk[0]; (e)++)
+
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -102,6 +114,8 @@ struct x86_pmu {
 	u64		intel_ctrl;
 	void		(*enable_bts)(u64 config);
 	void		(*disable_bts)(void);
+	int		(*get_event_idx)(struct cpu_hw_events *cpuc,
+					 struct hw_perf_event *hwc);
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -110,6 +124,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
 	.enabled = 1,
 };
 
+static const struct event_constraint *event_constraints;
+
 /*
  * Not sure about some of these
  */
@@ -155,6 +171,16 @@ static u64 p6_pmu_raw_event(u64 hw_event)
 	return hw_event & P6_EVNTSEL_MASK;
 }
 
+static const struct event_constraint intel_p6_event_constraints[] =
+{
+	EVENT_CONSTRAINT(0xc1, 0x1),	/* FLOPS */
+	EVENT_CONSTRAINT(0x10, 0x1),	/* FP_COMP_OPS_EXE */
+	EVENT_CONSTRAINT(0x11, 0x1),	/* FP_ASSIST */
+	EVENT_CONSTRAINT(0x12, 0x2),	/* MUL */
+	EVENT_CONSTRAINT(0x13, 0x2),	/* DIV */
+	EVENT_CONSTRAINT(0x14, 0x1),	/* CYCLES_DIV_BUSY */
+	EVENT_CONSTRAINT_END
+};
 
 /*
  * Intel PerfMon v3. Used on Core2 and later.
@@ -170,6 +196,35 @@ static const u64 intel_perfmon_event_map[] =
 	[PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
 };
 
+static const struct event_constraint intel_core_event_constraints[] =
+{
+	EVENT_CONSTRAINT(0x10, 0x1),	/* FP_COMP_OPS_EXE */
+	EVENT_CONSTRAINT(0x11, 0x2),	/* FP_ASSIST */
+	EVENT_CONSTRAINT(0x12, 0x2),	/* MUL */
+	EVENT_CONSTRAINT(0x13, 0x2),	/* DIV */
+	EVENT_CONSTRAINT(0x14, 0x1),	/* CYCLES_DIV_BUSY */
+	EVENT_CONSTRAINT(0x18, 0x1),	/* IDLE_DURING_DIV */
+	EVENT_CONSTRAINT(0x19, 0x2),	/* DELAYED_BYPASS */
+	EVENT_CONSTRAINT(0xa1, 0x1),	/* RS_UOPS_DISPATCH_CYCLES */
+	EVENT_CONSTRAINT(0xcb, 0x1),	/* MEM_LOAD_RETIRED */
+	EVENT_CONSTRAINT_END
+};
+
+static const struct event_constraint intel_nehalem_event_constraints[] =
+{
+	EVENT_CONSTRAINT(0x40, 0x3),	/* L1D_CACHE_LD */
+	EVENT_CONSTRAINT(0x41, 0x3),	/* L1D_CACHE_ST */
+	EVENT_CONSTRAINT(0x42, 0x3),	/* L1D_CACHE_LOCK */
+	EVENT_CONSTRAINT(0x43, 0x3),	/* L1D_ALL_REF */
+	EVENT_CONSTRAINT(0x4e, 0x3),	/* L1D_PREFETCH */
+	EVENT_CONSTRAINT(0x4c, 0x3),	/* LOAD_HIT_PRE */
+	EVENT_CONSTRAINT(0x51, 0x3),	/* L1D */
+	EVENT_CONSTRAINT(0x52, 0x3),	/* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
+	EVENT_CONSTRAINT(0x53, 0x3),	/* L1D_CACHE_LOCK_FB_HIT */
+	EVENT_CONSTRAINT(0xc5, 0x3),	/* CACHE_LOCK_CYCLES */
+	EVENT_CONSTRAINT_END
+};
+
 static u64 intel_pmu_event_map(int hw_event)
 {
 	return intel_perfmon_event_map[hw_event];
@@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX];
 
-static const u64 nehalem_hw_cache_event_ids
+static __initconst u64 nehalem_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids
 },
 };
 
-static const u64 core2_hw_cache_event_ids
+static __initconst u64 core2_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids
 },
 };
 
-static const u64 atom_hw_cache_event_ids
+static __initconst u64 atom_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -469,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
 #define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
 #define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
 #define CORE_EVNTSEL_INV_MASK		0x00800000ULL
-#define CORE_EVNTSEL_REG_MASK	0xFF000000ULL
+#define CORE_EVNTSEL_REG_MASK		0xFF000000ULL
 
 #define CORE_EVNTSEL_MASK		\
 	(CORE_EVNTSEL_EVENT_MASK |	\
@@ -481,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
 	return hw_event & CORE_EVNTSEL_MASK;
 }
 
-static const u64 amd_hw_cache_event_ids
+static __initconst u64 amd_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -932,6 +987,8 @@ static int __hw_perf_event_init(struct perf_event *event)
 	 */
 	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
 
+	hwc->idx = -1;
+
 	/*
 	 * Count user and OS events unless requested not to.
 	 */
@@ -1334,8 +1391,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
 	x86_pmu_enable_event(hwc, idx);
 }
 
-static int
-fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
+static int fixed_mode_idx(struct hw_perf_event *hwc)
 {
 	unsigned int hw_event;
 
@@ -1349,6 +1405,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
 	if (!x86_pmu.num_events_fixed)
 		return -1;
 
+	/*
+	 * fixed counters do not take all possible filters
+	 */
+	if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
+		return -1;
+
 	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
 	if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
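
The new filter check matters because the fixed-purpose counters count only a hard-wired condition: an event that also requests edge detection, inversion, a counter mask or similar EVNTSEL filters cannot honor them there, so it must fall through to a generic counter. An illustrative case (the raw encoding is an assumption for the example; the INV bit value matches CORE_EVNTSEL_INV_MASK defined earlier in this file):

	/*
	 * config = 0x0080003c: UNHALTED_CORE_CYCLES (0x3c) with the INV
	 * filter bit (1 << 23) set.  Without the check this could match
	 * the CPU_CYCLES fixed counter, which ignores INV; with it,
	 * fixed_mode_idx() returns -1 and a generic counter is used.
	 */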
@@ -1360,22 +1422,57 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
 }
 
 /*
- * Find a PMC slot for the freshly enabled / scheduled in event:
+ * generic counter allocator: get next free counter
  */
-static int x86_pmu_enable(struct perf_event *event)
+static int
+gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
+{
+	int idx;
+
+	idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
+	return idx == x86_pmu.num_events ? -1 : idx;
+}
+
+/*
+ * intel-specific counter allocator: check event constraints
+ */
+static int
+intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
+{
+	const struct event_constraint *event_constraint;
+	int i, code;
+
+	if (!event_constraints)
+		goto skip;
+
+	code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
+
+	for_each_event_constraint(event_constraint, event_constraints) {
+		if (code == event_constraint->code) {
+			for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
+				if (!test_and_set_bit(i, cpuc->used_mask))
+					return i;
+			}
+			return -1;
+		}
+	}
+skip:
+	return gen_get_event_idx(cpuc, hwc);
+}
+
+static int
+x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
 {
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct hw_perf_event *hwc = &event->hw;
 	int idx;
 
-	idx = fixed_mode_idx(event, hwc);
+	idx = fixed_mode_idx(hwc);
 	if (idx == X86_PMC_IDX_FIXED_BTS) {
 		/* BTS is already occupied. */
 		if (test_and_set_bit(idx, cpuc->used_mask))
 			return -EAGAIN;
 
 		hwc->config_base = 0;
 		hwc->event_base  = 0;
 		hwc->idx	 = idx;
 	} else if (idx >= 0) {
 		/*
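
A worked example of the constraint scheme introduced above: EVENT_CONSTRAINT(0x12, 0x2) declares that event code 0x12 (MUL) may only be programmed on the counter whose index bit is set in the mask, i.e. bit 1 = generic counter 1. intel_get_event_idx() walks exactly that mask; a standalone sketch of the same bitmask walk, for illustration only:

	/* constraint mask for MUL: only counter 1 is allowed */
	unsigned long idxmsk = 0x2;
	unsigned long used_mask = 0;	/* no counters taken yet */
	int i, idx = -1;

	for (i = 0; i < X86_PMC_IDX_MAX; i++) {
		if (!(idxmsk & (1UL << i)))
			continue;		/* counter i not allowed */
		if (!test_and_set_bit(i, &used_mask)) {
			idx = i;		/* claims counter 1 */
			break;
		}
	}
	/*
	 * idx == 1; a second concurrent MUL event would find bit 1
	 * already set and get -1, i.e. -EAGAIN from the scheduler.
	 */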
@@ -1396,20 +1493,35 @@ static int x86_pmu_enable(struct perf_event *event)
 	} else {
 		idx = hwc->idx;
 		/* Try to get the previous generic event again */
-		if (test_and_set_bit(idx, cpuc->used_mask)) {
+		if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
 try_generic:
-			idx = find_first_zero_bit(cpuc->used_mask,
-						  x86_pmu.num_events);
-			if (idx == x86_pmu.num_events)
+			idx = x86_pmu.get_event_idx(cpuc, hwc);
+			if (idx == -1)
 				return -EAGAIN;
 
 			set_bit(idx, cpuc->used_mask);
 			hwc->idx = idx;
 		}
 		hwc->config_base = x86_pmu.eventsel;
 		hwc->event_base  = x86_pmu.perfctr;
 	}
 
+	return idx;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in event:
+ */
+static int x86_pmu_enable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+
+	idx = x86_schedule_event(cpuc, hwc);
+	if (idx < 0)
+		return idx;
+
 	perf_events_lapic_init();
 
 	x86_pmu.disable(hwc, idx);
@@ -1852,7 +1964,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
 	.priority		= 1
 };
 
-static struct x86_pmu p6_pmu = {
+static __initconst struct x86_pmu p6_pmu = {
 	.name			= "p6",
 	.handle_irq		= p6_pmu_handle_irq,
 	.disable_all		= p6_pmu_disable_all,
@@ -1877,9 +1989,10 @@ static struct x86_pmu p6_pmu = {
 	 */
 	.event_bits		= 32,
 	.event_mask		= (1ULL << 32) - 1,
+	.get_event_idx		= intel_get_event_idx,
 };
 
-static struct x86_pmu intel_pmu = {
+static __initconst struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
 	.disable_all		= intel_pmu_disable_all,
@@ -1900,9 +2013,10 @@ static struct x86_pmu intel_pmu = {
 	.max_period		= (1ULL << 31) - 1,
 	.enable_bts		= intel_pmu_enable_bts,
 	.disable_bts		= intel_pmu_disable_bts,
+	.get_event_idx		= intel_get_event_idx,
 };
 
-static struct x86_pmu amd_pmu = {
+static __initconst struct x86_pmu amd_pmu = {
 	.name			= "AMD",
 	.handle_irq		= amd_pmu_handle_irq,
 	.disable_all		= amd_pmu_disable_all,
@@ -1920,9 +2034,10 @@ static struct x86_pmu amd_pmu = {
 	.apic			= 1,
 	/* use highest bit to detect overflow */
 	.max_period		= (1ULL << 47) - 1,
+	.get_event_idx		= gen_get_event_idx,
 };
 
-static int p6_pmu_init(void)
+static __init int p6_pmu_init(void)
 {
 	switch (boot_cpu_data.x86_model) {
 	case 1:
@@ -1932,10 +2047,12 @@ static int p6_pmu_init(void)
 	case 7:
 	case 8:
 	case 11: /* Pentium III */
+		event_constraints = intel_p6_event_constraints;
 		break;
 	case 9:
 	case 13:
 		/* Pentium M */
+		event_constraints = intel_p6_event_constraints;
 		break;
 	default:
 		pr_cont("unsupported p6 CPU model %d ",
@@ -1954,7 +2071,7 @@ static int p6_pmu_init(void)
 	return 0;
 }
 
-static int intel_pmu_init(void)
+static __init int intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
@@ -2007,12 +2124,14 @@ static int intel_pmu_init(void)
 		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Core2 events, ");
+		event_constraints = intel_core_event_constraints;
 		break;
 	default:
 	case 26:
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		event_constraints = intel_nehalem_event_constraints;
 		pr_cont("Nehalem/Corei7 events, ");
 		break;
 	case 28:
@@ -2025,7 +2144,7 @@ static int intel_pmu_init(void)
 	return 0;
 }
 
-static int amd_pmu_init(void)
+static __init int amd_pmu_init(void)
 {
 	/* Performance-monitoring supported from K7 and later: */
 	if (boot_cpu_data.x86 < 6)
@@ -2105,11 +2224,47 @@ static const struct pmu pmu = {
 	.unthrottle	= x86_pmu_unthrottle,
 };
 
+static int
+validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+	struct hw_perf_event fake_event = event->hw;
+
+	if (event->pmu && event->pmu != &pmu)
+		return 0;
+
+	return x86_schedule_event(cpuc, &fake_event) >= 0;
+}
+
+static int validate_group(struct perf_event *event)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+	struct cpu_hw_events fake_pmu;
+
+	memset(&fake_pmu, 0, sizeof(fake_pmu));
+
+	if (!validate_event(&fake_pmu, leader))
+		return -ENOSPC;
+
+	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+		if (!validate_event(&fake_pmu, sibling))
+			return -ENOSPC;
+	}
+
+	if (!validate_event(&fake_pmu, event))
+		return -ENOSPC;
+
+	return 0;
+}
+
 const struct pmu *hw_perf_event_init(struct perf_event *event)
 {
 	int err;
 
 	err = __hw_perf_event_init(event);
+	if (!err) {
+		if (event->group_leader != event)
+			err = validate_group(event);
+	}
 	if (err) {
 		if (event->destroy)
 			event->destroy(event);
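
validate_group() gives group creation fail-fast semantics: the whole group is dry-run scheduled against a zeroed, stack-local fake cpu_hw_events, so a group that can never fit on the PMU is rejected with -ENOSPC at perf_event_open() time instead of silently never counting. A hypothetical walk-through, using the Core constraints shown earlier:

	/*
	 * On a Core2, FP_COMP_OPS_EXE (0x10) and CYCLES_DIV_BUSY (0x14)
	 * are both constrained to counter 0 (mask 0x1), so grouping them:
	 *
	 *   leader  = FP_COMP_OPS_EXE  -> x86_schedule_event() on the
	 *                                 fake_pmu claims counter 0
	 *   sibling = CYCLES_DIV_BUSY  -> counter 0 already used in the
	 *                                 fake_pmu -> validate_event()
	 *                                 fails -> -ENOSPC
	 */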
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 7d52e9da5e0c..50b9c220e121 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -334,6 +334,10 @@ ENTRY(ret_from_fork)
 END(ret_from_fork)
 
 /*
+ * Interrupt exit functions should be protected against kprobes
+ */
+	.pushsection .kprobes.text, "ax"
+/*
  * Return to user mode is not as complex as all this looks,
  * but we want the default path for a system call return to
  * go as quickly as possible which is why some of this is
@@ -383,6 +387,10 @@ need_resched:
 END(resume_kernel)
 #endif
 	CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+	.popsection
 
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
@@ -513,6 +521,10 @@ sysexit_audit:
 	PTGS_TO_GS_EX
 ENDPROC(ia32_sysenter_target)
 
+/*
+ * syscall stub including irq exit should be protected against kprobes
+ */
+	.pushsection .kprobes.text, "ax"
 	# system call handler stub
 ENTRY(system_call)
 	RING0_INT_FRAME			# can't unwind into user space anyway
@@ -705,6 +717,10 @@ syscall_badsys:
 	jmp resume_userspace
 END(syscall_badsys)
 	CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+	.popsection
 
 /*
  * System calls that need a pt_regs pointer.
@@ -814,6 +830,10 @@ common_interrupt:
 ENDPROC(common_interrupt)
 	CFI_ENDPROC
 
+/*
+ * Irq entries should be protected against kprobes
+ */
+	.pushsection .kprobes.text, "ax"
 #define BUILD_INTERRUPT3(name, nr, fn)	\
 ENTRY(name)				\
 	RING0_INT_FRAME;		\
@@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug)
 	jmp error_code
 	CFI_ENDPROC
 END(spurious_interrupt_bug)
+/*
+ * End of kprobes section
+ */
+	.popsection
 
 ENTRY(kernel_thread_helper)
 	pushl $0			# fake return address for unwinder
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index bd5bbddddf91..722df1b1152d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -803,6 +803,10 @@ END(interrupt)
 	call \func
 	.endm
 
+/*
+ * Interrupt entry/exit should be protected against kprobes
+ */
+	.pushsection .kprobes.text, "ax"
 	/*
 	 * The interrupt stubs push (~vector+0x80) onto the stack and
 	 * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
 
 	CFI_ENDPROC
 END(common_interrupt)
+/*
+ * End of kprobes section
+ */
+	.popsection
 
 /*
  * APIC interrupts.
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..d42f65ac4927
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,555 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, cpu_dr7);
+EXPORT_PER_CPU_SYMBOL(cpu_dr7);
+
+/* Per cpu debug address registers values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
+
+/*
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for each cpus
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
+
+
+static inline unsigned long
+__encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+	unsigned long bp_info;
+
+	bp_info = (len | type) & 0xf;
+	bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+	bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
+
+	return bp_info;
+}
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+	return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
+}
+
+/*
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7.  Return the "enabled" status.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
+{
+	int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+	*len = (bp_info & 0xc) | 0x40;
+	*type = (bp_info & 0x3) | 0x80;
+
+	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
+}
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint. Eventually we enable it in the debug control register.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+		if (!*slot) {
+			*slot = bp;
+			break;
+		}
+	}
+
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return -EBUSY;
+
+	set_debugreg(info->address, i);
+	__get_cpu_var(cpu_debugreg[i]) = info->address;
+
+	dr7 = &__get_cpu_var(cpu_dr7);
+	*dr7 |= encode_dr7(i, info->len, info->type);
+
+	set_debugreg(*dr7, 7);
+
+	return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+		if (*slot == bp) {
+			*slot = NULL;
+			break;
+		}
+	}
+
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return;
+
+	dr7 = &__get_cpu_var(cpu_dr7);
+	*dr7 &= ~__encode_dr7(i, info->len, info->type);
+
+	set_debugreg(*dr7, 7);
+}
+
+static int get_hbp_len(u8 hbp_len)
+{
+	unsigned int len_in_bytes = 0;
+
+	switch (hbp_len) {
+	case X86_BREAKPOINT_LEN_1:
+		len_in_bytes = 1;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		len_in_bytes = 2;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		len_in_bytes = 4;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		len_in_bytes = 8;
+		break;
+#endif
+	}
+	return len_in_bytes;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
+{
+	unsigned int len;
+
+	len = get_hbp_len(hbp_len);
+
+	return (va <= TASK_SIZE - len);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+{
+	unsigned int len;
+
+	len = get_hbp_len(hbp_len);
+
+	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+static int arch_store_info(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	/*
+	 * For kernel-addresses, either the address or symbol name can be
+	 * specified.
+	 */
+	if (info->name)
+		info->address = (unsigned long)
+				kallsyms_lookup_name(info->name);
+	if (info->address)
+		return 0;
+
+	return -EINVAL;
+}
+
+int arch_bp_generic_fields(int x86_len, int x86_type,
+			   int *gen_len, int *gen_type)
+{
+	/* Len */
+	switch (x86_len) {
+	case X86_BREAKPOINT_LEN_1:
+		*gen_len = HW_BREAKPOINT_LEN_1;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		*gen_len = HW_BREAKPOINT_LEN_2;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		*gen_len = HW_BREAKPOINT_LEN_4;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		*gen_len = HW_BREAKPOINT_LEN_8;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	/* Type */
+	switch (x86_type) {
+	case X86_BREAKPOINT_EXECUTE:
+		*gen_type = HW_BREAKPOINT_X;
+		break;
+	case X86_BREAKPOINT_WRITE:
+		*gen_type = HW_BREAKPOINT_W;
+		break;
+	case X86_BREAKPOINT_RW:
+		*gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+
+static int arch_build_bp_info(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+	info->address = bp->attr.bp_addr;
+
+	/* Len */
+	switch (bp->attr.bp_len) {
+	case HW_BREAKPOINT_LEN_1:
+		info->len = X86_BREAKPOINT_LEN_1;
+		break;
+	case HW_BREAKPOINT_LEN_2:
+		info->len = X86_BREAKPOINT_LEN_2;
+		break;
+	case HW_BREAKPOINT_LEN_4:
+		info->len = X86_BREAKPOINT_LEN_4;
+		break;
+#ifdef CONFIG_X86_64
+	case HW_BREAKPOINT_LEN_8:
+		info->len = X86_BREAKPOINT_LEN_8;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	/* Type */
+	switch (bp->attr.bp_type) {
+	case HW_BREAKPOINT_W:
+		info->type = X86_BREAKPOINT_WRITE;
+		break;
+	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+		info->type = X86_BREAKPOINT_RW;
+		break;
+	case HW_BREAKPOINT_X:
+		info->type = X86_BREAKPOINT_EXECUTE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct perf_event *bp,
+				  struct task_struct *tsk)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned int align;
+	int ret;
+
+
+	ret = arch_build_bp_info(bp);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+
+	if (info->type == X86_BREAKPOINT_EXECUTE)
+		/*
+		 * Ptrace-refactoring code
+		 * For now, we'll allow instruction breakpoint only for user-space
+		 * addresses
+		 */
+		if ((!arch_check_va_in_userspace(info->address, info->len)) &&
+			info->len != X86_BREAKPOINT_EXECUTE)
+			return ret;
+
+	switch (info->len) {
+	case X86_BREAKPOINT_LEN_1:
+		align = 0;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		align = 1;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		align = 3;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		align = 7;
+		break;
+#endif
+	default:
+		return ret;
+	}
+
+	if (bp->callback)
+		ret = arch_store_info(bp);
+
+	if (ret < 0)
+		return ret;
+	/*
+	 * Check that the low-order bits of the address are appropriate
+	 * for the alignment implied by len.
+	 */
+	if (info->address & align)
+		return -EINVAL;
+
+	/* Check that the virtual address is in the proper range */
+	if (tsk) {
+		if (!arch_check_va_in_userspace(info->address, info->len))
+			return -EFAULT;
+	} else {
+		if (!arch_check_va_in_kernelspace(info->address, info->len))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * Dump the debug register contents to the user.
+ * We can't dump our per cpu values because it
+ * may contain cpu wide breakpoint, something that
+ * doesn't belong to the current task.
+ *
+ * TODO: include non-ptrace user breakpoints (perf)
+ */
+void aout_dump_debugregs(struct user *dump)
+{
+	int i;
+	int dr7 = 0;
+	struct perf_event *bp;
+	struct arch_hw_breakpoint *info;
+	struct thread_struct *thread = &current->thread;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		bp = thread->ptrace_bps[i];
+
+		if (bp && !bp->attr.disabled) {
+			dump->u_debugreg[i] = bp->attr.bp_addr;
+			info = counter_arch_bp(bp);
+			dr7 |= encode_dr7(i, info->len, info->type);
+		} else {
+			dump->u_debugreg[i] = 0;
+		}
+	}
+
+	dump->u_debugreg[4] = 0;
+	dump->u_debugreg[5] = 0;
+	dump->u_debugreg[6] = current->thread.debugreg6;
+
+	dump->u_debugreg[7] = dr7;
+}
+EXPORT_SYMBOL_GPL(aout_dump_debugregs);
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+	int i;
+	struct thread_struct *t = &tsk->thread;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		unregister_hw_breakpoint(t->ptrace_bps[i]);
+		t->ptrace_bps[i] = NULL;
+	}
+}
+
+void hw_breakpoint_restore(void)
+{
+	set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
+	set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
+	set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
+	set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
+	set_debugreg(current->thread.debugreg6, 6);
+	set_debugreg(__get_cpu_var(cpu_dr7), 7);
+}
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+
+/*
+ * Handle debug exception notifications.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE returned if one of the following conditions is true.
+ * i) When the causative address is from user-space and the exception
+ * is a valid one, i.e. not triggered as a result of lazy debug register
+ * switching
+ * ii) When there are more bits than trap<n> set in DR6 register (such
+ * as BD, BS or BT) indicating that more than one debug condition is
+ * met and requires some more action in do_debug().
+ *
+ * NOTIFY_STOP returned for all other cases
+ *
+ */
+static int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	int i, cpu, rc = NOTIFY_STOP;
+	struct perf_event *bp;
+	unsigned long dr7, dr6;
+	unsigned long *dr6_p;
+
+	/* The DR6 value is pointed by args->err */
+	dr6_p = (unsigned long *)ERR_PTR(args->err);
+	dr6 = *dr6_p;
+
+	/* Do an early return if no trap bits are set in DR6 */
+	if ((dr6 & DR_TRAP_BITS) == 0)
+		return NOTIFY_DONE;
+
+	get_debugreg(dr7, 7);
+	/* Disable breakpoints during exception handling */
+	set_debugreg(0UL, 7);
+	/*
+	 * Assert that local interrupts are disabled
+	 * Reset the DRn bits in the virtualized register value.
+	 * The ptrace trigger routine will add in whatever is needed.
+	 */
+	current->thread.debugreg6 &= ~DR_TRAP_BITS;
+	cpu = get_cpu();
+
+	/* Handle all the breakpoints that were triggered */
+	for (i = 0; i < HBP_NUM; ++i) {
+		if (likely(!(dr6 & (DR_TRAP0 << i))))
+			continue;
+
+		/*
+		 * The counter may be concurrently released but that can only
+		 * occur from a call_rcu() path. We can then safely fetch
+		 * the breakpoint, use its callback, touch its counter
+		 * while we are in an rcu_read_lock() path.
+		 */
+		rcu_read_lock();
+
+		bp = per_cpu(bp_per_reg[i], cpu);
+		if (bp)
+			rc = NOTIFY_DONE;
+		/*
+		 * Reset the 'i'th TRAP bit in dr6 to denote completion of
+		 * exception handling
+		 */
+		(*dr6_p) &= ~(DR_TRAP0 << i);
+		/*
+		 * bp can be NULL due to lazy debug register switching
+		 * or due to concurrent perf counter removing.
+		 */
+		if (!bp) {
+			rcu_read_unlock();
+			break;
+		}
+
+		(bp->callback)(bp, args->regs);
+
+		rcu_read_unlock();
+	}
+	if (dr6 & (~DR_TRAP_BITS))
+		rc = NOTIFY_DONE;
+
+	set_debugreg(dr7, 7);
+	put_cpu();
+
+	return rc;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	if (val != DIE_DEBUG)
+		return NOTIFY_DONE;
+
+	return hw_breakpoint_handler(data);
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+	/* TODO */
+}
+
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
+{
+	/* TODO */
+}
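
A worked example of the DR7 layout that __encode_dr7() above implements: each of the four breakpoints gets a 2-bit enable field in the low byte (DR_ENABLE_SIZE bits apart) and a 4-bit len|type nibble starting at bit 16 (DR_CONTROL_SHIFT), DR_CONTROL_SIZE bits apart. For a 4-byte read/write breakpoint in debug register 1 — the constant values are inferred from decode_dr7() above and the debugreg definitions at this point in the tree, and are shown only as an illustration:

	/* encode_dr7(1, X86_BREAKPOINT_LEN_4, X86_BREAKPOINT_RW)
	 *
	 * (len | type) & 0xf = (0x4c | 0x83) & 0xf = 0xf  (len=11b, type=11b)
	 * 0xf << (DR_CONTROL_SHIFT + 1 * DR_CONTROL_SIZE)
	 *     = 0xf << 20                          = 0x00f00000
	 * DR_GLOBAL_ENABLE << (1 * DR_ENABLE_SIZE)
	 *     = 0x2 << 2                           = 0x00000008
	 * | DR_GLOBAL_SLOWDOWN (the GE flag)       = 0x00000200
	 *                                          ------------
	 * dr7                                      = 0x00f00208
	 */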
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 04bbd5278568..19212cb01558 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
 	seq_printf(p, "  TLB shootdowns\n");
 #endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
 	seq_printf(p, "%*s: ", prec, "TRM");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
 	seq_printf(p, "  Thermal event interrupts\n");
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
 	seq_printf(p, "%*s: ", prec, "THR");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
 	seq_printf(p, "  Threshold APIC interrupts\n");
-# endif
 #endif
 #ifdef CONFIG_X86_MCE
 	seq_printf(p, "%*s: ", prec, "MCE");
@@ -194,11 +194,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
194 sum += irq_stats(cpu)->irq_call_count; 194 sum += irq_stats(cpu)->irq_call_count;
195 sum += irq_stats(cpu)->irq_tlb_count; 195 sum += irq_stats(cpu)->irq_tlb_count;
196#endif 196#endif
197#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_THERMAL_VECTOR
198 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
199# ifdef CONFIG_X86_MCE_THRESHOLD 199#endif
200#ifdef CONFIG_X86_MCE_THRESHOLD
200 sum += irq_stats(cpu)->irq_threshold_count; 201 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..34e86b67550c 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,6 +43,7 @@
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45 45
46#include <asm/debugreg.h>
46#include <asm/apicdef.h> 47#include <asm/apicdef.h>
47#include <asm/system.h> 48#include <asm/system.h>
48 49
@@ -434,6 +435,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
434 "resuming...\n"); 435 "resuming...\n");
435 kgdb_arch_handle_exception(args->trapnr, args->signr, 436 kgdb_arch_handle_exception(args->trapnr, args->signr,
436 args->err, "c", "", regs); 437 args->err, "c", "", regs);
438 /*
439 * Reset the BS bit in dr6 (pointed to by args->err) to
440 * denote completion of processing
441 */
442 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
437 443
438 return NOTIFY_STOP; 444 return NOTIFY_STOP;
439} 445}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..3fe86d706a14 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,12 +48,15 @@
48#include <linux/preempt.h> 48#include <linux/preempt.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h>
51 52
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
53#include <asm/desc.h> 54#include <asm/desc.h>
54#include <asm/pgtable.h> 55#include <asm/pgtable.h>
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/alternative.h> 57#include <asm/alternative.h>
58#include <asm/insn.h>
59#include <asm/debugreg.h>
57 60
58void jprobe_return_end(void); 61void jprobe_return_end(void);
59 62
@@ -106,50 +109,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
106 /* ----------------------------------------------- */ 109 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108}; 111};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W 112#undef W
154 113
155struct kretprobe_blackpoint kretprobe_blacklist[] = { 114struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +203,75 @@ retry:
244 } 203 }
245} 204}
246 205
206/* Recover the probed instruction at addr for further analysis. */
207static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
208{
209 struct kprobe *kp;
210 kp = get_kprobe((void *)addr);
211 if (!kp)
212 return -EINVAL;
213
214 /*
215 * Normally, kp->ainsn.insn holds a copy of the original instruction.
216 * However, a RIP-relative instruction cannot be single-stepped at a
217 * different place, so fix_riprel() tweaks the displacement of that
218 * copy. In that case, we can't recover the original instruction
219 * from kp->ainsn.insn.
220 *
221 * On the other hand, kp->opcode holds a copy of the first byte of
222 * the probed instruction, which int3 has overwritten. Since kprobes
223 * modifies nothing at kp->addr except that first byte, we can
224 * recover the original instruction from the memory there combined
225 * with kp->opcode.
226 */
227 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
228 buf[0] = kp->opcode;
229 return 0;
230}
231
232/* Dummy buffers for kallsyms_lookup */
233static char __dummy_buf[KSYM_NAME_LEN];
234
235/* Check if paddr is at an instruction boundary */
236static int __kprobes can_probe(unsigned long paddr)
237{
238 int ret;
239 unsigned long addr, offset = 0;
240 struct insn insn;
241 kprobe_opcode_t buf[MAX_INSN_SIZE];
242
243 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
244 return 0;
245
246 /* Decode instructions */
247 addr = paddr - offset;
248 while (addr < paddr) {
249 kernel_insn_init(&insn, (void *)addr);
250 insn_get_opcode(&insn);
251
252 /*
253 * Check if the instruction has been modified by another
254 * kprobe, in which case we replace the breakpoint by the
255 * original instruction in our buffer.
256 */
257 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
258 ret = recover_probed_instruction(buf, addr);
259 if (ret)
260 /*
261 * Another debugging subsystem might insert
262 * this breakpoint. In that case, we can't
263 * recover it.
264 */
265 return 0;
266 kernel_insn_init(&insn, buf);
267 }
268 insn_get_length(&insn);
269 addr += insn.length;
270 }
271
272 return (addr == paddr);
273}
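
can_probe() is essentially an instruction-boundary check built on the in-kernel decoder from <asm/insn.h> that this patch starts using. As a standalone illustration of the decoder loop, here is a sketch that counts decoded instructions in a half-open address range; the helper name is made up and the error handling is deliberately simplistic:

#include <asm/insn.h>

/* Illustrative only: walk instruction boundaries in [start, end). */
static int count_insns(unsigned long start, unsigned long end)
{
	struct insn insn;
	unsigned long addr = start;
	int nr = 0;

	while (addr < end) {
		kernel_insn_init(&insn, (void *)addr);
		insn_get_length(&insn);	/* decodes prefixes through immediate */
		if (!insn.length)
			break;		/* bail out on undecodable bytes */
		addr += insn.length;
		nr++;
	}
	return nr;
}
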
274
247/* 275/*
248 * Returns non-zero if opcode modifies the interrupt flag. 276 * Returns non-zero if opcode modifies the interrupt flag.
249 */ 277 */
@@ -277,68 +305,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
277static void __kprobes fix_riprel(struct kprobe *p) 305static void __kprobes fix_riprel(struct kprobe *p)
278{ 306{
279#ifdef CONFIG_X86_64 307#ifdef CONFIG_X86_64
280 u8 *insn = p->ainsn.insn; 308 struct insn insn;
281 s64 disp; 309 kernel_insn_init(&insn, p->ainsn.insn);
282 int need_modrm;
283
284 /* Skip legacy instruction prefixes. */
285 while (1) {
286 switch (*insn) {
287 case 0x66:
288 case 0x67:
289 case 0x2e:
290 case 0x3e:
291 case 0x26:
292 case 0x64:
293 case 0x65:
294 case 0x36:
295 case 0xf0:
296 case 0xf3:
297 case 0xf2:
298 ++insn;
299 continue;
300 }
301 break;
302 }
303 310
304 /* Skip REX instruction prefix. */ 311 if (insn_rip_relative(&insn)) {
305 if (is_REX_prefix(insn)) 312 s64 newdisp;
306 ++insn; 313 u8 *disp;
307 314 insn_get_displacement(&insn);
308 if (*insn == 0x0f) { 315 /*
309 /* Two-byte opcode. */ 316 * The copied instruction uses the %rip-relative addressing
310 ++insn; 317 * mode. Adjust the displacement for the difference between
311 need_modrm = test_bit(*insn, 318 * the original location of this instruction and the location
312 (unsigned long *)twobyte_has_modrm); 319 * of the copy that will actually be run. The tricky bit here
313 } else 320 * is making sure that the sign extension happens correctly in
314 /* One-byte opcode. */ 321 * this calculation, since we need a signed 32-bit result to
315 need_modrm = test_bit(*insn, 322 * be sign-extended to 64 bits when it's added to the %rip
316 (unsigned long *)onebyte_has_modrm); 323 * value and yield the same 64-bit result that the sign-
317 324 * extension of the original signed 32-bit displacement would
318 if (need_modrm) { 325 * have given.
319 u8 modrm = *++insn; 326 */
320 if ((modrm & 0xc7) == 0x05) { 327 newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
321 /* %rip+disp32 addressing mode */ 328 (u8 *) p->ainsn.insn;
322 /* Displacement follows ModRM byte. */ 329 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
323 ++insn; 330 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
324 /* 331 *(s32 *) disp = (s32) newdisp;
325 * The copied instruction uses the %rip-relative
326 * addressing mode. Adjust the displacement for the
327 * difference between the original location of this
328 * instruction and the location of the copy that will
329 * actually be run. The tricky bit here is making sure
330 * that the sign extension happens correctly in this
331 * calculation, since we need a signed 32-bit result to
332 * be sign-extended to 64 bits when it's added to the
333 * %rip value and yield the same 64-bit result that the
334 * sign-extension of the original signed 32-bit
335 * displacement would have given.
336 */
337 disp = (u8 *) p->addr + *((s32 *) insn) -
338 (u8 *) p->ainsn.insn;
339 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
340 *(s32 *)insn = (s32) disp;
341 }
342 } 332 }
343#endif 333#endif
344} 334}
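
The displacement arithmetic above is worth spelling out. A %rip-relative operand encodes target = end_of_insn + disp, and because the copy in p->ainsn.insn is byte-identical to the original, the instruction length cancels out; keeping the target fixed therefore only requires newdisp = disp + (p->addr - p->ainsn.insn). A compressed restatement of the fix-up, where addr, copy, disp and disp_off are stand-ins for the fields used in the code above:

/* Sketch only: addr/copy/disp/disp_off stand in for p->addr,
 * p->ainsn.insn and the decoded displacement fields above. */
s64 newdisp = (u8 *)addr + (s64)disp - (u8 *)copy;

BUG_ON((s64)(s32)newdisp != newdisp);	/* must still fit in 32 bits */
*(s32 *)(copy + disp_off) = (s32)newdisp;
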
@@ -359,6 +349,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
359 349
360int __kprobes arch_prepare_kprobe(struct kprobe *p) 350int __kprobes arch_prepare_kprobe(struct kprobe *p)
361{ 351{
352 if (!can_probe((unsigned long)p->addr))
353 return -EILSEQ;
362 /* insn: must be on special executable page on x86. */ 354 /* insn: must be on special executable page on x86. */
363 p->ainsn.insn = get_insn_slot(); 355 p->ainsn.insn = get_insn_slot();
364 if (!p->ainsn.insn) 356 if (!p->ainsn.insn)
@@ -472,17 +464,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
472{ 464{
473 switch (kcb->kprobe_status) { 465 switch (kcb->kprobe_status) {
474 case KPROBE_HIT_SSDONE: 466 case KPROBE_HIT_SSDONE:
475#ifdef CONFIG_X86_64
476 /* TODO: Provide re-entrancy from post_kprobes_handler() and
477 * avoid exception stack corruption while single-stepping on
478 * the instruction of the new probe.
479 */
480 arch_disarm_kprobe(p);
481 regs->ip = (unsigned long)p->addr;
482 reset_current_kprobe();
483 preempt_enable_no_resched();
484 break;
485#endif
486 case KPROBE_HIT_ACTIVE: 467 case KPROBE_HIT_ACTIVE:
487 save_previous_kprobe(kcb); 468 save_previous_kprobe(kcb);
488 set_current_kprobe(p, regs, kcb); 469 set_current_kprobe(p, regs, kcb);
@@ -491,18 +472,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
491 kcb->kprobe_status = KPROBE_REENTER; 472 kcb->kprobe_status = KPROBE_REENTER;
492 break; 473 break;
493 case KPROBE_HIT_SS: 474 case KPROBE_HIT_SS:
494 if (p == kprobe_running()) { 475 /* A probe has been hit in the codepath leading up to, or just
495 regs->flags &= ~X86_EFLAGS_TF; 476 * after, single-stepping of a probed instruction. This entire
496 regs->flags |= kcb->kprobe_saved_flags; 477 * codepath should strictly reside in .kprobes.text section.
497 return 0; 478 * Raise a BUG or we'll continue in an endless reentering loop
498 } else { 479 * and eventually overflow the stack.
499 /* A probe has been hit in the codepath leading up 480 */
500 * to, or just after, single-stepping of a probed 481 printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
501 * instruction. This entire codepath should strictly 482 p->addr);
502 * reside in .kprobes.text section. Raise a warning 483 dump_kprobe(p);
503 * to highlight this peculiar case. 484 BUG();
504 */
505 }
506 default: 485 default:
507 /* impossible cases */ 486 /* impossible cases */
508 WARN_ON(1); 487 WARN_ON(1);
@@ -967,8 +946,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
967 ret = NOTIFY_STOP; 946 ret = NOTIFY_STOP;
968 break; 947 break;
969 case DIE_DEBUG: 948 case DIE_DEBUG:
970 if (post_kprobe_handler(args->regs)) 949 if (post_kprobe_handler(args->regs)) {
950 /*
951 * Reset the BS bit in dr6 (pointed to by args->err) to
952 * denote completion of processing
953 */
954 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
971 ret = NOTIFY_STOP; 955 ret = NOTIFY_STOP;
956 }
972 break; 957 break;
973 case DIE_GPF: 958 case DIE_GPF:
974 /* 959 /*
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d00130..c843f8406da2 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
25#include <asm/desc.h> 25#include <asm/desc.h>
26#include <asm/system.h> 26#include <asm/system.h>
27#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
28#include <asm/debugreg.h>
28 29
29static void set_idt(void *newidt, __u16 limit) 30static void set_idt(void *newidt, __u16 limit)
30{ 31{
@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image)
202 203
203 /* Interrupts aren't acceptable while we reboot */ 204 /* Interrupts aren't acceptable while we reboot */
204 local_irq_disable(); 205 local_irq_disable();
206 hw_breakpoint_disable();
205 207
206 if (image->preserve_context) { 208 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 209#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..4a8bb82248ae 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
18#include <asm/pgtable.h> 18#include <asm/pgtable.h>
19#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/debugreg.h>
21 22
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd, 23static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr) 24 unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
282 283
283 /* Interrupts aren't acceptable while we reboot */ 284 /* Interrupts aren't acceptable while we reboot */
284 local_irq_disable(); 285 local_irq_disable();
286 hw_breakpoint_disable();
285 287
286 if (image->preserve_context) { 288 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC 289#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..744508e7cfdd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -10,6 +10,7 @@
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <trace/events/power.h> 12#include <trace/events/power.h>
13#include <linux/hw_breakpoint.h>
13#include <asm/system.h> 14#include <asm/system.h>
14#include <asm/apic.h> 15#include <asm/apic.h>
15#include <asm/syscalls.h> 16#include <asm/syscalls.h>
@@ -17,6 +18,7 @@
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18#include <asm/i387.h> 19#include <asm/i387.h>
19#include <asm/ds.h> 20#include <asm/ds.h>
21#include <asm/debugreg.h>
20 22
21unsigned long idle_halt; 23unsigned long idle_halt;
22EXPORT_SYMBOL(idle_halt); 24EXPORT_SYMBOL(idle_halt);
@@ -103,14 +105,7 @@ void flush_thread(void)
103 } 105 }
104#endif 106#endif
105 107
106 clear_tsk_thread_flag(tsk, TIF_DEBUG); 108 flush_ptrace_hw_breakpoint(tsk);
107
108 tsk->thread.debugreg0 = 0;
109 tsk->thread.debugreg1 = 0;
110 tsk->thread.debugreg2 = 0;
111 tsk->thread.debugreg3 = 0;
112 tsk->thread.debugreg6 = 0;
113 tsk->thread.debugreg7 = 0;
114 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 109 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
115 /* 110 /*
116 * Forget coprocessor state.. 111 * Forget coprocessor state..
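
flush_thread() no longer clears six debug registers by hand: per-thread breakpoints are now perf events hanging off thread.ptrace_bps[], so flushing a thread means releasing those events. The helper is defined elsewhere in this series; a sketch of its plausible shape, for orientation only:

#include <linux/hw_breakpoint.h>

/* Sketch: release every ptrace breakpoint still owned by @tsk. */
void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
{
	int i;
	struct thread_struct *t = &tsk->thread;

	for (i = 0; i < HBP_NUM; i++) {
		if (t->ptrace_bps[i])
			unregister_hw_breakpoint(t->ptrace_bps[i]);
		t->ptrace_bps[i] = NULL;
	}
}
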
@@ -192,16 +187,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
192 else if (next->debugctlmsr != prev->debugctlmsr) 187 else if (next->debugctlmsr != prev->debugctlmsr)
193 update_debugctlmsr(next->debugctlmsr); 188 update_debugctlmsr(next->debugctlmsr);
194 189
195 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
196 set_debugreg(next->debugreg0, 0);
197 set_debugreg(next->debugreg1, 1);
198 set_debugreg(next->debugreg2, 2);
199 set_debugreg(next->debugreg3, 3);
200 /* no 4 and 5 */
201 set_debugreg(next->debugreg6, 6);
202 set_debugreg(next->debugreg7, 7);
203 }
204
205 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 190 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
206 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 191 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
207 /* prev and next are different */ 192 /* prev and next are different */
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cdab..d5bd3132ee70 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,6 +58,7 @@
58#include <asm/idle.h> 58#include <asm/idle.h>
59#include <asm/syscalls.h> 59#include <asm/syscalls.h>
60#include <asm/ds.h> 60#include <asm/ds.h>
61#include <asm/debugreg.h>
61 62
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 63asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 64
@@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
259 260
260 task_user_gs(p) = get_user_gs(regs); 261 task_user_gs(p) = get_user_gs(regs);
261 262
263 p->thread.io_bitmap_ptr = NULL;
262 tsk = current; 264 tsk = current;
265 err = -ENOMEM;
266
267 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
268
263 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 269 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
264 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, 270 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
265 IO_BITMAP_BYTES, GFP_KERNEL); 271 IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eb62cbcaa490..70cf15873f3d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,6 +52,7 @@
52#include <asm/idle.h> 52#include <asm/idle.h>
53#include <asm/syscalls.h> 53#include <asm/syscalls.h>
54#include <asm/ds.h> 54#include <asm/ds.h>
55#include <asm/debugreg.h>
55 56
56asmlinkage extern void ret_from_fork(void); 57asmlinkage extern void ret_from_fork(void);
57 58
@@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
297 298
298 p->thread.fs = me->thread.fs; 299 p->thread.fs = me->thread.fs;
299 p->thread.gs = me->thread.gs; 300 p->thread.gs = me->thread.gs;
301 p->thread.io_bitmap_ptr = NULL;
300 302
301 savesegment(gs, p->thread.gsindex); 303 savesegment(gs, p->thread.gsindex);
302 savesegment(fs, p->thread.fsindex); 304 savesegment(fs, p->thread.fsindex);
303 savesegment(es, p->thread.es); 305 savesegment(es, p->thread.es);
304 savesegment(ds, p->thread.ds); 306 savesegment(ds, p->thread.ds);
305 307
308 err = -ENOMEM;
309 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
310
306 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 311 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
307 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 312 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
308 if (!p->thread.io_bitmap_ptr) { 313 if (!p->thread.io_bitmap_ptr) {
@@ -341,6 +346,7 @@ out:
341 kfree(p->thread.io_bitmap_ptr); 346 kfree(p->thread.io_bitmap_ptr);
342 p->thread.io_bitmap_max = 0; 347 p->thread.io_bitmap_max = 0;
343 } 348 }
349
344 return err; 350 return err;
345} 351}
346 352
@@ -495,6 +501,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
495 */ 501 */
496 if (preload_fpu) 502 if (preload_fpu)
497 __math_state_restore(); 503 __math_state_restore();
504
498 return prev_p; 505 return prev_p;
499} 506}
500 507
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7b058a2dc66a..04d182a7cfdb 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/workqueue.h> 24#include <linux/workqueue.h>
25#include <linux/perf_event.h>
26#include <linux/hw_breakpoint.h>
25 27
26#include <asm/uaccess.h> 28#include <asm/uaccess.h>
27#include <asm/pgtable.h> 29#include <asm/pgtable.h>
@@ -34,6 +36,7 @@
34#include <asm/prctl.h> 36#include <asm/prctl.h>
35#include <asm/proto.h> 37#include <asm/proto.h>
36#include <asm/ds.h> 38#include <asm/ds.h>
39#include <asm/hw_breakpoint.h>
37 40
38#include "tls.h" 41#include "tls.h"
39 42
@@ -49,6 +52,118 @@ enum x86_regset {
49 REGSET_IOPERM32, 52 REGSET_IOPERM32,
50}; 53};
51 54
55struct pt_regs_offset {
56 const char *name;
57 int offset;
58};
59
60#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
61#define REG_OFFSET_END {.name = NULL, .offset = 0}
62
63static const struct pt_regs_offset regoffset_table[] = {
64#ifdef CONFIG_X86_64
65 REG_OFFSET_NAME(r15),
66 REG_OFFSET_NAME(r14),
67 REG_OFFSET_NAME(r13),
68 REG_OFFSET_NAME(r12),
69 REG_OFFSET_NAME(r11),
70 REG_OFFSET_NAME(r10),
71 REG_OFFSET_NAME(r9),
72 REG_OFFSET_NAME(r8),
73#endif
74 REG_OFFSET_NAME(bx),
75 REG_OFFSET_NAME(cx),
76 REG_OFFSET_NAME(dx),
77 REG_OFFSET_NAME(si),
78 REG_OFFSET_NAME(di),
79 REG_OFFSET_NAME(bp),
80 REG_OFFSET_NAME(ax),
81#ifdef CONFIG_X86_32
82 REG_OFFSET_NAME(ds),
83 REG_OFFSET_NAME(es),
84 REG_OFFSET_NAME(fs),
85 REG_OFFSET_NAME(gs),
86#endif
87 REG_OFFSET_NAME(orig_ax),
88 REG_OFFSET_NAME(ip),
89 REG_OFFSET_NAME(cs),
90 REG_OFFSET_NAME(flags),
91 REG_OFFSET_NAME(sp),
92 REG_OFFSET_NAME(ss),
93 REG_OFFSET_END,
94};
95
96/**
97 * regs_query_register_offset() - query register offset from its name
98 * @name: the name of a register
99 *
100 * regs_query_register_offset() returns the offset of a register in struct
101 * pt_regs from its name. If the name is invalid, this returns -EINVAL.
102 */
103int regs_query_register_offset(const char *name)
104{
105 const struct pt_regs_offset *roff;
106 for (roff = regoffset_table; roff->name != NULL; roff++)
107 if (!strcmp(roff->name, name))
108 return roff->offset;
109 return -EINVAL;
110}
111
112/**
113 * regs_query_register_name() - query register name from its offset
114 * @offset: the offset of a register in struct pt_regs.
115 *
116 * regs_query_register_name() returns the name of a register from its
117 * offset in struct pt_regs. If the @offset is invalid, this returns NULL.
118 */
119const char *regs_query_register_name(unsigned int offset)
120{
121 const struct pt_regs_offset *roff;
122 for (roff = regoffset_table; roff->name != NULL; roff++)
123 if (roff->offset == offset)
124 return roff->name;
125 return NULL;
126}
127
128static const int arg_offs_table[] = {
129#ifdef CONFIG_X86_32
130 [0] = offsetof(struct pt_regs, ax),
131 [1] = offsetof(struct pt_regs, dx),
132 [2] = offsetof(struct pt_regs, cx)
133#else /* CONFIG_X86_64 */
134 [0] = offsetof(struct pt_regs, di),
135 [1] = offsetof(struct pt_regs, si),
136 [2] = offsetof(struct pt_regs, dx),
137 [3] = offsetof(struct pt_regs, cx),
138 [4] = offsetof(struct pt_regs, r8),
139 [5] = offsetof(struct pt_regs, r9)
140#endif
141};
142
143/**
144 * regs_get_argument_nth() - get Nth argument at function call
145 * @regs: pt_regs which contains registers at function entry.
146 * @n: argument number.
147 *
148 * regs_get_argument_nth() returns the @n th argument of a function call.
149 * Because the kernel stack usually changes right after function entry,
150 * this must only be used at function entry. If the @n th argument is in
151 * neither the kernel stack nor pt_regs, this returns 0.
152 */
153unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
154{
155 if (n < ARRAY_SIZE(arg_offs_table))
156 return *(unsigned long *)((char *)regs + arg_offs_table[n]);
157 else {
158 /*
159 * The typical case: arg n is on the stack.
160 * (Note: stack[0] = return address, so skip it)
161 */
162 n -= ARRAY_SIZE(arg_offs_table);
163 return regs_get_kernel_stack_nth(regs, 1 + n);
164 }
165}
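
These lookup helpers exist mainly for kprobe-based tracers that accept register names from user input. A hedged usage sketch (dump_entry_state() is a made-up caller; it assumes a pt_regs captured at function entry, as the kerneldoc above requires):

#include <linux/kernel.h>
#include <asm/ptrace.h>

/* Made-up caller: resolve a register by name, then read it. */
static void dump_entry_state(struct pt_regs *regs)
{
	int off = regs_query_register_offset("ip");

	if (off >= 0)
		pr_info("%s = %lx\n", regs_query_register_name(off),
			*(unsigned long *)((char *)regs + off));

	/* Arguments beyond the register-passed ones come from the stack. */
	pr_info("arg7 = %lx\n", regs_get_argument_nth(regs, 6));
}
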
166
52/* 167/*
53 * does not yet catch signals sent when the child dies. 168 * does not yet catch signals sent when the child dies.
54 * in exit.c or in signal.c. 169 * in exit.c or in signal.c.
@@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
137 return 0; 252 return 0;
138} 253}
139 254
140static unsigned long debugreg_addr_limit(struct task_struct *task)
141{
142 return TASK_SIZE - 3;
143}
144
145#else /* CONFIG_X86_64 */ 255#else /* CONFIG_X86_64 */
146 256
147#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) 257#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
266 return 0; 376 return 0;
267} 377}
268 378
269static unsigned long debugreg_addr_limit(struct task_struct *task)
270{
271#ifdef CONFIG_IA32_EMULATION
272 if (test_tsk_thread_flag(task, TIF_IA32))
273 return IA32_PAGE_OFFSET - 3;
274#endif
275 return TASK_SIZE_MAX - 7;
276}
277
278#endif /* CONFIG_X86_32 */ 379#endif /* CONFIG_X86_32 */
279 380
280static unsigned long get_flags(struct task_struct *task) 381static unsigned long get_flags(struct task_struct *task)
@@ -454,99 +555,239 @@ static int genregs_set(struct task_struct *target,
454 return ret; 555 return ret;
455} 556}
456 557
558static void ptrace_triggered(struct perf_event *bp, void *data)
559{
560 int i;
561 struct thread_struct *thread = &(current->thread);
562
563 /*
564 * Store in the virtual DR6 register the fact that the breakpoint
565 * was hit so the thread's debugger will see it.
566 */
567 for (i = 0; i < HBP_NUM; i++) {
568 if (thread->ptrace_bps[i] == bp)
569 break;
570 }
571
572 thread->debugreg6 |= (DR_TRAP0 << i);
573}
574
457/* 575/*
458 * This function is trivial and will be inlined by the compiler. 576 * Walk through all the ptrace breakpoints for this thread and
459 * Having it separates the implementation details of debug 577 * build the dr7 value on top of their attributes.
460 * registers from the interface details of ptrace. 578 *
461 */ 579 */
462static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) 580static unsigned long ptrace_get_dr7(struct perf_event *bp[])
463{ 581{
464 switch (n) { 582 int i;
465 case 0: return child->thread.debugreg0; 583 int dr7 = 0;
466 case 1: return child->thread.debugreg1; 584 struct arch_hw_breakpoint *info;
467 case 2: return child->thread.debugreg2; 585
468 case 3: return child->thread.debugreg3; 586 for (i = 0; i < HBP_NUM; i++) {
469 case 6: return child->thread.debugreg6; 587 if (bp[i] && !bp[i]->attr.disabled) {
470 case 7: return child->thread.debugreg7; 588 info = counter_arch_bp(bp[i]);
589 dr7 |= encode_dr7(i, info->len, info->type);
590 }
471 } 591 }
472 return 0; 592
593 return dr7;
473} 594}
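
ptrace_get_dr7() rebuilds the control word from the perf events instead of caching it. For orientation: slot i owns an enable pair at bits 2*i..2*i+1 and a 4-bit R/Wi|LENi nibble starting at bit 16 + 4*i. A sketch of the encoding that encode_dr7() plausibly performs; the real helper lives in the hw_breakpoint headers, so treat the shifts here as illustrative:

/* Sketch of the DR7 layout assumed by encode_dr7()/decode_dr7(). */
static unsigned long sketch_encode_dr7(int i, unsigned int len,
				       unsigned int type)
{
	unsigned long dr7 = 0;

	dr7 |= 1UL << (i * 2 + 1);			/* global enable, slot i */
	dr7 |= ((len | type) & 0xfUL) << (16 + i * 4);	/* R/Wi | LENi */

	return dr7;
}
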
474 595
475static int ptrace_set_debugreg(struct task_struct *child, 596static struct perf_event *
476 int n, unsigned long data) 597ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
598 struct task_struct *tsk, int disabled)
477{ 599{
478 int i; 600 int err;
601 int gen_len, gen_type;
602 DEFINE_BREAKPOINT_ATTR(attr);
479 603
480 if (unlikely(n == 4 || n == 5)) 604 /*
481 return -EIO; 605 * We should have at least an inactive breakpoint at this
606 * slot. It means the user is writing dr7 without having
607 * written the address register first.
608 */
609 if (!bp)
610 return ERR_PTR(-EINVAL);
482 611
483 if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) 612 err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
484 return -EIO; 613 if (err)
614 return ERR_PTR(err);
485 615
486 switch (n) { 616 attr = bp->attr;
487 case 0: child->thread.debugreg0 = data; break; 617 attr.bp_len = gen_len;
488 case 1: child->thread.debugreg1 = data; break; 618 attr.bp_type = gen_type;
489 case 2: child->thread.debugreg2 = data; break; 619 attr.disabled = disabled;
490 case 3: child->thread.debugreg3 = data; break;
491 620
492 case 6: 621 return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
493 if ((data & ~0xffffffffUL) != 0) 622}
494 return -EIO; 623
495 child->thread.debugreg6 = data; 624/*
496 break; 625 * Handle ptrace writes to debug register 7.
626 */
627static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
628{
629 struct thread_struct *thread = &(tsk->thread);
630 unsigned long old_dr7;
631 int i, orig_ret = 0, rc = 0;
632 int enabled, second_pass = 0;
633 unsigned len, type;
634 struct perf_event *bp;
635
636 data &= ~DR_CONTROL_RESERVED;
637 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
638restore:
639 /*
640 * Loop through all the hardware breakpoints, making the
641 * appropriate changes to each.
642 */
643 for (i = 0; i < HBP_NUM; i++) {
644 enabled = decode_dr7(data, i, &len, &type);
645 bp = thread->ptrace_bps[i];
646
647 if (!enabled) {
648 if (bp) {
649 /*
650 * Don't unregister the breakpoints right away,
651 * unless all register_user_hw_breakpoint()
652 * requests have succeeded. This prevents
653 * any window of opportunity for debug
654 * register grabbing by other users.
655 */
656 if (!second_pass)
657 continue;
658
659 thread->ptrace_bps[i] = NULL;
660 bp = ptrace_modify_breakpoint(bp, len, type,
661 tsk, 1);
662 if (IS_ERR(bp)) {
663 rc = PTR_ERR(bp);
664 thread->ptrace_bps[i] = NULL;
665 break;
666 }
667 thread->ptrace_bps[i] = bp;
668 }
669 continue;
670 }
671
672 bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
673
674 /* Incorrect bp, or we have a bug in bp API */
675 if (IS_ERR(bp)) {
676 rc = PTR_ERR(bp);
677 thread->ptrace_bps[i] = NULL;
678 break;
679 }
680 thread->ptrace_bps[i] = bp;
681 }
682 /*
683 * Make a second pass to free the remaining unused breakpoints
684 * or to restore the original breakpoints if an error occurred.
685 */
686 if (!second_pass) {
687 second_pass = 1;
688 if (rc < 0) {
689 orig_ret = rc;
690 data = old_dr7;
691 }
692 goto restore;
693 }
694 return ((orig_ret < 0) ? orig_ret : rc);
695}
696
697/*
698 * Handle PTRACE_PEEKUSR calls for the debug register area.
699 */
700static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
701{
702 struct thread_struct *thread = &(tsk->thread);
703 unsigned long val = 0;
497 704
498 case 7: 705 if (n < HBP_NUM) {
706 struct perf_event *bp;
707 bp = thread->ptrace_bps[n];
708 if (!bp)
709 return 0;
710 val = bp->hw.info.address;
711 } else if (n == 6) {
712 val = thread->debugreg6;
713 } else if (n == 7) {
714 val = ptrace_get_dr7(thread->ptrace_bps);
715 }
716 return val;
717}
718
719static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
720 unsigned long addr)
721{
722 struct perf_event *bp;
723 struct thread_struct *t = &tsk->thread;
724 DEFINE_BREAKPOINT_ATTR(attr);
725
726 if (!t->ptrace_bps[nr]) {
499 /* 727 /*
500 * Sanity-check data. Take one half-byte at once with 728 * Use a stub len and type to register (reserve) an
501 * check = (val >> (16 + 4*i)) & 0xf. It contains the 729 * inactive but otherwise correct bp.
502 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
503 * 2 and 3 are LENi. Given a list of invalid values,
504 * we do mask |= 1 << invalid_value, so that
505 * (mask >> check) & 1 is a correct test for invalid
506 * values.
507 *
508 * R/Wi contains the type of the breakpoint /
509 * watchpoint, LENi contains the length of the watched
510 * data in the watchpoint case.
511 *
512 * The invalid values are:
513 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
514 * - R/Wi == 0x10 (break on I/O reads or writes), so
515 * mask |= 0x4444.
516 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
517 * 0x1110.
518 *
519 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
520 *
521 * See the Intel Manual "System Programming Guide",
522 * 15.2.4
523 *
524 * Note that LENi == 0x10 is defined on x86_64 in long
525 * mode (i.e. even for 32-bit userspace software, but
526 * 64-bit kernel), so the x86_64 mask value is 0x5454.
527 * See the AMD manual no. 24593 (AMD64 System Programming)
528 */ 730 */
529#ifdef CONFIG_X86_32 731 attr.bp_addr = addr;
530#define DR7_MASK 0x5f54 732 attr.bp_len = HW_BREAKPOINT_LEN_1;
531#else 733 attr.bp_type = HW_BREAKPOINT_W;
532#define DR7_MASK 0x5554 734 attr.disabled = 1;
533#endif 735
534 data &= ~DR_CONTROL_RESERVED; 736 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
535 for (i = 0; i < 4; i++) 737 } else {
536 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) 738 bp = t->ptrace_bps[nr];
537 return -EIO; 739 t->ptrace_bps[nr] = NULL;
538 child->thread.debugreg7 = data; 740
539 if (data) 741 attr = bp->attr;
540 set_tsk_thread_flag(child, TIF_DEBUG); 742 attr.bp_addr = addr;
541 else 743 bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
542 clear_tsk_thread_flag(child, TIF_DEBUG);
543 break;
544 } 744 }
745 /*
746 * CHECKME: the previous code returned -EIO if the addr wasn't a
747 * valid task virtual addr. The new one will return -EINVAL in this
748 * case.
749 * -EINVAL may be what we want for in-kernel breakpoint users, but
750 * -EIO looks better for ptrace, since we refuse a register write
751 * for the user. And anyway this was the previous behaviour.
752 */
753 if (IS_ERR(bp))
754 return PTR_ERR(bp);
755
756 t->ptrace_bps[nr] = bp;
545 757
546 return 0; 758 return 0;
547} 759}
548 760
549/* 761/*
762 * Handle PTRACE_POKEUSR calls for the debug register area.
763 */
764int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
765{
766 struct thread_struct *thread = &(tsk->thread);
767 int rc = 0;
768
769 /* There are no DR4 or DR5 registers */
770 if (n == 4 || n == 5)
771 return -EIO;
772
773 if (n == 6) {
774 thread->debugreg6 = val;
775 goto ret_path;
776 }
777 if (n < HBP_NUM) {
778 rc = ptrace_set_breakpoint_addr(tsk, n, val);
779 if (rc)
780 return rc;
781 }
782 /* All that's left is DR7 */
783 if (n == 7)
784 rc = ptrace_write_dr7(tsk, val);
785
786ret_path:
787 return rc;
788}
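
Seen from user space nothing changes: a debugger still programs the debug registers through the u_debugreg[] slots of struct user. A minimal, error-handling-free sketch of arming a one-byte write watchpoint on a stopped child and reading back the virtualized DR6 after it fires:

#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

/* Sketch: DR0 = watch address; DR7 = L0 enable, R/W0 = write, LEN0 = 1. */
static int arm_watchpoint(pid_t pid, unsigned long addr)
{
	unsigned long dr7 = 0x1UL | (0x1UL << 16);

	if (ptrace(PTRACE_POKEUSER, pid,
		   (void *)offsetof(struct user, u_debugreg[0]),
		   (void *)addr) < 0)
		return -1;
	return ptrace(PTRACE_POKEUSER, pid,
		      (void *)offsetof(struct user, u_debugreg[7]),
		      (void *)dr7);
}

/* After waitpid() reports SIGTRAP, DR6 says which slot fired. */
static long read_dr6(pid_t pid)
{
	return ptrace(PTRACE_PEEKUSER, pid,
		      (void *)offsetof(struct user, u_debugreg[6]), 0);
}
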
789
790/*
550 * These access the current or another (stopped) task's io permission 791 * These access the current or another (stopped) task's io permission
551 * bitmap for debugging or core dump. 792 * bitmap for debugging or core dump.
552 */ 793 */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2a34f9c5be21..c0ca8f921c91 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -109,6 +109,7 @@
109#ifdef CONFIG_X86_64 109#ifdef CONFIG_X86_64
110#include <asm/numa_64.h> 110#include <asm/numa_64.h>
111#endif 111#endif
112#include <asm/mce.h>
112 113
113/* 114/*
114 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 115 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -1031,6 +1032,8 @@ void __init setup_arch(char **cmdline_p)
1031#endif 1032#endif
1032#endif 1033#endif
1033 x86_init.oem.banner(); 1034 x86_init.oem.banner();
1035
1036 mcheck_init();
1034} 1037}
1035 1038
1036#ifdef CONFIG_X86_32 1039#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..fbf3b07c8567 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -799,15 +799,6 @@ static void do_signal(struct pt_regs *regs)
799 799
800 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 800 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
801 if (signr > 0) { 801 if (signr > 0) {
802 /*
803 * Re-enable any watchpoints before delivering the
804 * signal to user space. The processor register will
805 * have been cleared if the watchpoint triggered
806 * inside the kernel.
807 */
808 if (current->thread.debugreg7)
809 set_debugreg(current->thread.debugreg7, 7);
810
811 /* Whee! Actually deliver the signal. */ 802 /* Whee! Actually deliver the signal. */
812 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 803 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
813 /* 804 /*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dcee0cc3..33399176512a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
530{ 530{
531 struct task_struct *tsk = current; 531 struct task_struct *tsk = current;
532 unsigned long condition; 532 unsigned long dr6;
533 int si_code; 533 int si_code;
534 534
535 get_debugreg(condition, 6); 535 get_debugreg(dr6, 6);
536 536
537 /* Catch kmemcheck conditions first of all! */ 537 /* Catch kmemcheck conditions first of all! */
538 if (condition & DR_STEP && kmemcheck_trap(regs)) 538 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 539 return;
540 540
541 /* DR6 may or may not be cleared by the CPU */
542 set_debugreg(0, 6);
541 /* 543 /*
542 * The processor cleared BTF, so don't mark that we need it set. 544 * The processor cleared BTF, so don't mark that we need it set.
543 */ 545 */
544 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 546 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
545 tsk->thread.debugctlmsr = 0; 547 tsk->thread.debugctlmsr = 0;
546 548
547 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 549 /* Store the virtualized DR6 value */
548 SIGTRAP) == NOTIFY_STOP) 550 tsk->thread.debugreg6 = dr6;
551
552 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
553 SIGTRAP) == NOTIFY_STOP)
549 return; 554 return;
550 555
551 /* It's safe to allow irq's after DR6 has been saved */ 556 /* It's safe to allow irq's after DR6 has been saved */
552 preempt_conditional_sti(regs); 557 preempt_conditional_sti(regs);
553 558
554 /* Mask out spurious debug traps due to lazy DR7 setting */ 559 if (regs->flags & X86_VM_MASK) {
555 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 560 handle_vm86_trap((struct kernel_vm86_regs *) regs,
556 if (!tsk->thread.debugreg7) 561 error_code, 1);
557 goto clear_dr7; 562 return;
558 } 563 }
559 564
560#ifdef CONFIG_X86_32
561 if (regs->flags & X86_VM_MASK)
562 goto debug_vm86;
563#endif
564
565 /* Save debug status register where ptrace can see it */
566 tsk->thread.debugreg6 = condition;
567
568 /* 565 /*
569 * Single-stepping through TF: make sure we ignore any events in 566 * Single-stepping through system calls: ignore any exceptions in
570 * kernel space (but re-enable TF when returning to user mode). 567 * kernel space, but re-enable TF when returning to user mode.
568 *
569 * We already checked v86 mode above, so we can check for kernel mode
570 * by just checking the CPL of CS.
571 */ 571 */
572 if (condition & DR_STEP) { 572 if ((dr6 & DR_STEP) && !user_mode(regs)) {
573 if (!user_mode(regs)) 573 tsk->thread.debugreg6 &= ~DR_STEP;
574 goto clear_TF_reenable; 574 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
575 regs->flags &= ~X86_EFLAGS_TF;
575 } 576 }
576 577 si_code = get_si_code(tsk->thread.debugreg6);
577 si_code = get_si_code(condition); 578 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
578 /* Ok, finally something we can handle */ 579 send_sigtrap(tsk, regs, error_code, si_code);
579 send_sigtrap(tsk, regs, error_code, si_code);
580
581 /*
582 * Disable additional traps. They'll be re-enabled when
583 * the signal is delivered.
584 */
585clear_dr7:
586 set_debugreg(0, 7);
587 preempt_conditional_cli(regs); 580 preempt_conditional_cli(regs);
588 return;
589 581
590#ifdef CONFIG_X86_32
591debug_vm86:
592 /* reenable preemption: handle_vm86_trap() might sleep */
593 dec_preempt_count();
594 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
595 conditional_cli(regs);
596 return;
597#endif
598
599clear_TF_reenable:
600 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
601 regs->flags &= ~X86_EFLAGS_TF;
602 preempt_conditional_cli(regs);
603 return; 582 return;
604} 583}
605 584