diff options
author | Steven Rostedt <srostedt@redhat.com> | 2010-06-03 09:36:50 -0400 |
---|---|---|
committer | Steven Rostedt <rostedt@goodmis.org> | 2010-06-03 19:32:38 -0400 |
commit | 5168ae50a66e3ff7184c2b16d661bd6d70367e50 (patch) | |
tree | 2fb21fc3bd346e4f589605d940dfb1bacac30bf5 /kernel | |
parent | d1f74e20b5b064a130cd0743a256c2d3cfe84010 (diff) |
tracing: Remove ftrace_preempt_disable/enable
The ftrace_preempt_disable/enable functions were to address a
recursive race caused by the function tracer. The function tracer
traces all functions which makes it easily susceptible to recursion.
One area was preempt_enable(). This would call the scheduler and
the scheduler would call the function tracer and loop.
(Or so it was thought.)
The ftrace_preempt_disable/enable was made to protect against recursion
inside the scheduler by storing the NEED_RESCHED flag. If it was
set before the ftrace_preempt_disable() it would not call schedule
on ftrace_preempt_enable(), thinking that if it was set before then
it would have already scheduled unless it was already in the scheduler.
This worked fine except in the case of SMP, where another task would set
the NEED_RESCHED flag for a task on another CPU, and then kick off an
IPI to trigger it. This could cause the NEED_RESCHED to be saved at
ftrace_preempt_disable() but the IPI to arrive in the preempt
disabled section. The ftrace_preempt_enable() would not call the scheduler
because the flag was already set before entering the section.
This bug would cause a missed preemption check and cause higher latencies.
Investigating further, I found that the recursion caused by the function
tracer was not due to schedule(), but due to preempt_schedule(). Now
that preempt_schedule is completely annotated with notrace, the recursion
is no longer an issue.
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/trace/ftrace.c | 5 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 38 | ||||
-rw-r--r-- | kernel/trace/trace.c | 5 | ||||
-rw-r--r-- | kernel/trace/trace.h | 48 | ||||
-rw-r--r-- | kernel/trace/trace_clock.c | 5 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 5 | ||||
-rw-r--r-- | kernel/trace/trace_functions.c | 6 | ||||
-rw-r--r-- | kernel/trace/trace_sched_wakeup.c | 5 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 6 |
9 files changed, 24 insertions, 99 deletions
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d2cb14f9449..0d88ce9b9fb8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -1883,7 +1883,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) | |||
1883 | struct hlist_head *hhd; | 1883 | struct hlist_head *hhd; |
1884 | struct hlist_node *n; | 1884 | struct hlist_node *n; |
1885 | unsigned long key; | 1885 | unsigned long key; |
1886 | int resched; | ||
1887 | 1886 | ||
1888 | key = hash_long(ip, FTRACE_HASH_BITS); | 1887 | key = hash_long(ip, FTRACE_HASH_BITS); |
1889 | 1888 | ||
@@ -1897,12 +1896,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) | |||
1897 | * period. This syncs the hash iteration and freeing of items | 1896 | * period. This syncs the hash iteration and freeing of items |
1898 | * on the hash. rcu_read_lock is too dangerous here. | 1897 | * on the hash. rcu_read_lock is too dangerous here. |
1899 | */ | 1898 | */ |
1900 | resched = ftrace_preempt_disable(); | 1899 | preempt_disable_notrace(); |
1901 | hlist_for_each_entry_rcu(entry, n, hhd, node) { | 1900 | hlist_for_each_entry_rcu(entry, n, hhd, node) { |
1902 | if (entry->ip == ip) | 1901 | if (entry->ip == ip) |
1903 | entry->ops->func(ip, parent_ip, &entry->data); | 1902 | entry->ops->func(ip, parent_ip, &entry->data); |
1904 | } | 1903 | } |
1905 | ftrace_preempt_enable(resched); | 1904 | preempt_enable_notrace(); |
1906 | } | 1905 | } |
1907 | 1906 | ||
1908 | static struct ftrace_ops trace_probe_ops __read_mostly = | 1907 | static struct ftrace_ops trace_probe_ops __read_mostly = |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f6059c5aa94..c3d3cd9c2a53 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -2234,8 +2234,6 @@ static void trace_recursive_unlock(void) | |||
2234 | 2234 | ||
2235 | #endif | 2235 | #endif |
2236 | 2236 | ||
2237 | static DEFINE_PER_CPU(int, rb_need_resched); | ||
2238 | |||
2239 | /** | 2237 | /** |
2240 | * ring_buffer_lock_reserve - reserve a part of the buffer | 2238 | * ring_buffer_lock_reserve - reserve a part of the buffer |
2241 | * @buffer: the ring buffer to reserve from | 2239 | * @buffer: the ring buffer to reserve from |
@@ -2256,13 +2254,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) | |||
2256 | { | 2254 | { |
2257 | struct ring_buffer_per_cpu *cpu_buffer; | 2255 | struct ring_buffer_per_cpu *cpu_buffer; |
2258 | struct ring_buffer_event *event; | 2256 | struct ring_buffer_event *event; |
2259 | int cpu, resched; | 2257 | int cpu; |
2260 | 2258 | ||
2261 | if (ring_buffer_flags != RB_BUFFERS_ON) | 2259 | if (ring_buffer_flags != RB_BUFFERS_ON) |
2262 | return NULL; | 2260 | return NULL; |
2263 | 2261 | ||
2264 | /* If we are tracing schedule, we don't want to recurse */ | 2262 | /* If we are tracing schedule, we don't want to recurse */ |
2265 | resched = ftrace_preempt_disable(); | 2263 | preempt_disable_notrace(); |
2266 | 2264 | ||
2267 | if (atomic_read(&buffer->record_disabled)) | 2265 | if (atomic_read(&buffer->record_disabled)) |
2268 | goto out_nocheck; | 2266 | goto out_nocheck; |
@@ -2287,21 +2285,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) | |||
2287 | if (!event) | 2285 | if (!event) |
2288 | goto out; | 2286 | goto out; |
2289 | 2287 | ||
2290 | /* | ||
2291 | * Need to store resched state on this cpu. | ||
2292 | * Only the first needs to. | ||
2293 | */ | ||
2294 | |||
2295 | if (preempt_count() == 1) | ||
2296 | per_cpu(rb_need_resched, cpu) = resched; | ||
2297 | |||
2298 | return event; | 2288 | return event; |
2299 | 2289 | ||
2300 | out: | 2290 | out: |
2301 | trace_recursive_unlock(); | 2291 | trace_recursive_unlock(); |
2302 | 2292 | ||
2303 | out_nocheck: | 2293 | out_nocheck: |
2304 | ftrace_preempt_enable(resched); | 2294 | preempt_enable_notrace(); |
2305 | return NULL; | 2295 | return NULL; |
2306 | } | 2296 | } |
2307 | EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); | 2297 | EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); |
@@ -2347,13 +2337,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, | |||
2347 | 2337 | ||
2348 | trace_recursive_unlock(); | 2338 | trace_recursive_unlock(); |
2349 | 2339 | ||
2350 | /* | 2340 | preempt_enable_notrace(); |
2351 | * Only the last preempt count needs to restore preemption. | ||
2352 | */ | ||
2353 | if (preempt_count() == 1) | ||
2354 | ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); | ||
2355 | else | ||
2356 | preempt_enable_no_resched_notrace(); | ||
2357 | 2341 | ||
2358 | return 0; | 2342 | return 0; |
2359 | } | 2343 | } |
@@ -2461,13 +2445,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, | |||
2461 | 2445 | ||
2462 | trace_recursive_unlock(); | 2446 | trace_recursive_unlock(); |
2463 | 2447 | ||
2464 | /* | 2448 | preempt_enable_notrace(); |
2465 | * Only the last preempt count needs to restore preemption. | ||
2466 | */ | ||
2467 | if (preempt_count() == 1) | ||
2468 | ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); | ||
2469 | else | ||
2470 | preempt_enable_no_resched_notrace(); | ||
2471 | 2449 | ||
2472 | } | 2450 | } |
2473 | EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); | 2451 | EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); |
@@ -2493,12 +2471,12 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
2493 | struct ring_buffer_event *event; | 2471 | struct ring_buffer_event *event; |
2494 | void *body; | 2472 | void *body; |
2495 | int ret = -EBUSY; | 2473 | int ret = -EBUSY; |
2496 | int cpu, resched; | 2474 | int cpu; |
2497 | 2475 | ||
2498 | if (ring_buffer_flags != RB_BUFFERS_ON) | 2476 | if (ring_buffer_flags != RB_BUFFERS_ON) |
2499 | return -EBUSY; | 2477 | return -EBUSY; |
2500 | 2478 | ||
2501 | resched = ftrace_preempt_disable(); | 2479 | preempt_disable_notrace(); |
2502 | 2480 | ||
2503 | if (atomic_read(&buffer->record_disabled)) | 2481 | if (atomic_read(&buffer->record_disabled)) |
2504 | goto out; | 2482 | goto out; |
@@ -2528,7 +2506,7 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
2528 | 2506 | ||
2529 | ret = 0; | 2507 | ret = 0; |
2530 | out: | 2508 | out: |
2531 | ftrace_preempt_enable(resched); | 2509 | preempt_enable_notrace(); |
2532 | 2510 | ||
2533 | return ret; | 2511 | return ret; |
2534 | } | 2512 | } |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 55e48511d7c8..35727140f4fb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -1404,7 +1404,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1404 | struct bprint_entry *entry; | 1404 | struct bprint_entry *entry; |
1405 | unsigned long flags; | 1405 | unsigned long flags; |
1406 | int disable; | 1406 | int disable; |
1407 | int resched; | ||
1408 | int cpu, len = 0, size, pc; | 1407 | int cpu, len = 0, size, pc; |
1409 | 1408 | ||
1410 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 1409 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
@@ -1414,7 +1413,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1414 | pause_graph_tracing(); | 1413 | pause_graph_tracing(); |
1415 | 1414 | ||
1416 | pc = preempt_count(); | 1415 | pc = preempt_count(); |
1417 | resched = ftrace_preempt_disable(); | 1416 | preempt_disable_notrace(); |
1418 | cpu = raw_smp_processor_id(); | 1417 | cpu = raw_smp_processor_id(); |
1419 | data = tr->data[cpu]; | 1418 | data = tr->data[cpu]; |
1420 | 1419 | ||
@@ -1452,7 +1451,7 @@ out_unlock: | |||
1452 | 1451 | ||
1453 | out: | 1452 | out: |
1454 | atomic_dec_return(&data->disabled); | 1453 | atomic_dec_return(&data->disabled); |
1455 | ftrace_preempt_enable(resched); | 1454 | preempt_enable_notrace(); |
1456 | unpause_graph_tracing(); | 1455 | unpause_graph_tracing(); |
1457 | 1456 | ||
1458 | return len; | 1457 | return len; |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cd96399463f..6c45e55097ce 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -628,54 +628,6 @@ enum trace_iterator_flags { | |||
628 | 628 | ||
629 | extern struct tracer nop_trace; | 629 | extern struct tracer nop_trace; |
630 | 630 | ||
631 | /** | ||
632 | * ftrace_preempt_disable - disable preemption scheduler safe | ||
633 | * | ||
634 | * When tracing can happen inside the scheduler, there exists | ||
635 | * cases that the tracing might happen before the need_resched | ||
636 | * flag is checked. If this happens and the tracer calls | ||
637 | * preempt_enable (after a disable), a schedule might take place | ||
638 | * causing an infinite recursion. | ||
639 | * | ||
640 | * To prevent this, we read the need_resched flag before | ||
641 | * disabling preemption. When we want to enable preemption we | ||
642 | * check the flag, if it is set, then we call preempt_enable_no_resched. | ||
643 | * Otherwise, we call preempt_enable. | ||
644 | * | ||
645 | * The rational for doing the above is that if need_resched is set | ||
646 | * and we have yet to reschedule, we are either in an atomic location | ||
647 | * (where we do not need to check for scheduling) or we are inside | ||
648 | * the scheduler and do not want to resched. | ||
649 | */ | ||
650 | static inline int ftrace_preempt_disable(void) | ||
651 | { | ||
652 | int resched; | ||
653 | |||
654 | resched = need_resched(); | ||
655 | preempt_disable_notrace(); | ||
656 | |||
657 | return resched; | ||
658 | } | ||
659 | |||
660 | /** | ||
661 | * ftrace_preempt_enable - enable preemption scheduler safe | ||
662 | * @resched: the return value from ftrace_preempt_disable | ||
663 | * | ||
664 | * This is a scheduler safe way to enable preemption and not miss | ||
665 | * any preemption checks. The disabled saved the state of preemption. | ||
666 | * If resched is set, then we are either inside an atomic or | ||
667 | * are inside the scheduler (we would have already scheduled | ||
668 | * otherwise). In this case, we do not want to call normal | ||
669 | * preempt_enable, but preempt_enable_no_resched instead. | ||
670 | */ | ||
671 | static inline void ftrace_preempt_enable(int resched) | ||
672 | { | ||
673 | if (resched) | ||
674 | preempt_enable_no_resched_notrace(); | ||
675 | else | ||
676 | preempt_enable_notrace(); | ||
677 | } | ||
678 | |||
679 | #ifdef CONFIG_BRANCH_TRACER | 631 | #ifdef CONFIG_BRANCH_TRACER |
680 | extern int enable_branch_tracing(struct trace_array *tr); | 632 | extern int enable_branch_tracing(struct trace_array *tr); |
681 | extern void disable_branch_tracing(void); | 633 | extern void disable_branch_tracing(void); |
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 9d589d8dcd1a..52fda6c04ac3 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
@@ -32,16 +32,15 @@ | |||
32 | u64 notrace trace_clock_local(void) | 32 | u64 notrace trace_clock_local(void) |
33 | { | 33 | { |
34 | u64 clock; | 34 | u64 clock; |
35 | int resched; | ||
36 | 35 | ||
37 | /* | 36 | /* |
38 | * sched_clock() is an architecture implemented, fast, scalable, | 37 | * sched_clock() is an architecture implemented, fast, scalable, |
39 | * lockless clock. It is not guaranteed to be coherent across | 38 | * lockless clock. It is not guaranteed to be coherent across |
40 | * CPUs, nor across CPU idle events. | 39 | * CPUs, nor across CPU idle events. |
41 | */ | 40 | */ |
42 | resched = ftrace_preempt_disable(); | 41 | preempt_disable_notrace(); |
43 | clock = sched_clock(); | 42 | clock = sched_clock(); |
44 | ftrace_preempt_enable(resched); | 43 | preempt_enable_notrace(); |
45 | 44 | ||
46 | return clock; | 45 | return clock; |
47 | } | 46 | } |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53cffc0b0801..a594f9a7ee3d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -1524,12 +1524,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) | |||
1524 | struct ftrace_entry *entry; | 1524 | struct ftrace_entry *entry; |
1525 | unsigned long flags; | 1525 | unsigned long flags; |
1526 | long disabled; | 1526 | long disabled; |
1527 | int resched; | ||
1528 | int cpu; | 1527 | int cpu; |
1529 | int pc; | 1528 | int pc; |
1530 | 1529 | ||
1531 | pc = preempt_count(); | 1530 | pc = preempt_count(); |
1532 | resched = ftrace_preempt_disable(); | 1531 | preempt_disable_notrace(); |
1533 | cpu = raw_smp_processor_id(); | 1532 | cpu = raw_smp_processor_id(); |
1534 | disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); | 1533 | disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); |
1535 | 1534 | ||
@@ -1551,7 +1550,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) | |||
1551 | 1550 | ||
1552 | out: | 1551 | out: |
1553 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); | 1552 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); |
1554 | ftrace_preempt_enable(resched); | 1553 | preempt_enable_notrace(); |
1555 | } | 1554 | } |
1556 | 1555 | ||
1557 | static struct ftrace_ops trace_ops __initdata = | 1556 | static struct ftrace_ops trace_ops __initdata = |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b3f3776b0cd6..16aee4d44e8f 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | |||
54 | struct trace_array_cpu *data; | 54 | struct trace_array_cpu *data; |
55 | unsigned long flags; | 55 | unsigned long flags; |
56 | long disabled; | 56 | long disabled; |
57 | int cpu, resched; | 57 | int cpu; |
58 | int pc; | 58 | int pc; |
59 | 59 | ||
60 | if (unlikely(!ftrace_function_enabled)) | 60 | if (unlikely(!ftrace_function_enabled)) |
61 | return; | 61 | return; |
62 | 62 | ||
63 | pc = preempt_count(); | 63 | pc = preempt_count(); |
64 | resched = ftrace_preempt_disable(); | 64 | preempt_disable_notrace(); |
65 | local_save_flags(flags); | 65 | local_save_flags(flags); |
66 | cpu = raw_smp_processor_id(); | 66 | cpu = raw_smp_processor_id(); |
67 | data = tr->data[cpu]; | 67 | data = tr->data[cpu]; |
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | |||
71 | trace_function(tr, ip, parent_ip, flags, pc); | 71 | trace_function(tr, ip, parent_ip, flags, pc); |
72 | 72 | ||
73 | atomic_dec(&data->disabled); | 73 | atomic_dec(&data->disabled); |
74 | ftrace_preempt_enable(resched); | 74 | preempt_enable_notrace(); |
75 | } | 75 | } |
76 | 76 | ||
77 | static void | 77 | static void |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0e73bc2ef8c5..c9fd5bd02036 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
46 | struct trace_array_cpu *data; | 46 | struct trace_array_cpu *data; |
47 | unsigned long flags; | 47 | unsigned long flags; |
48 | long disabled; | 48 | long disabled; |
49 | int resched; | ||
50 | int cpu; | 49 | int cpu; |
51 | int pc; | 50 | int pc; |
52 | 51 | ||
@@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
54 | return; | 53 | return; |
55 | 54 | ||
56 | pc = preempt_count(); | 55 | pc = preempt_count(); |
57 | resched = ftrace_preempt_disable(); | 56 | preempt_disable_notrace(); |
58 | 57 | ||
59 | cpu = raw_smp_processor_id(); | 58 | cpu = raw_smp_processor_id(); |
60 | if (cpu != wakeup_current_cpu) | 59 | if (cpu != wakeup_current_cpu) |
@@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
74 | out: | 73 | out: |
75 | atomic_dec(&data->disabled); | 74 | atomic_dec(&data->disabled); |
76 | out_enable: | 75 | out_enable: |
77 | ftrace_preempt_enable(resched); | 76 | preempt_enable_notrace(); |
78 | } | 77 | } |
79 | 78 | ||
80 | static struct ftrace_ops trace_ops __read_mostly = | 79 | static struct ftrace_ops trace_ops __read_mostly = |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index f4bc9b27de5f..056468eae7cf 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -110,12 +110,12 @@ static inline void check_stack(void) | |||
110 | static void | 110 | static void |
111 | stack_trace_call(unsigned long ip, unsigned long parent_ip) | 111 | stack_trace_call(unsigned long ip, unsigned long parent_ip) |
112 | { | 112 | { |
113 | int cpu, resched; | 113 | int cpu; |
114 | 114 | ||
115 | if (unlikely(!ftrace_enabled || stack_trace_disabled)) | 115 | if (unlikely(!ftrace_enabled || stack_trace_disabled)) |
116 | return; | 116 | return; |
117 | 117 | ||
118 | resched = ftrace_preempt_disable(); | 118 | preempt_disable_notrace(); |
119 | 119 | ||
120 | cpu = raw_smp_processor_id(); | 120 | cpu = raw_smp_processor_id(); |
121 | /* no atomic needed, we only modify this variable by this cpu */ | 121 | /* no atomic needed, we only modify this variable by this cpu */ |
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
127 | out: | 127 | out: |
128 | per_cpu(trace_active, cpu)--; | 128 | per_cpu(trace_active, cpu)--; |
129 | /* prevent recursion in schedule */ | 129 | /* prevent recursion in schedule */ |
130 | ftrace_preempt_enable(resched); | 130 | preempt_enable_notrace(); |
131 | } | 131 | } |
132 | 132 | ||
133 | static struct ftrace_ops trace_ops __read_mostly = | 133 | static struct ftrace_ops trace_ops __read_mostly = |