aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorSteven Rostedt <srostedt@redhat.com>2010-06-03 09:36:50 -0400
committerSteven Rostedt <rostedt@goodmis.org>2010-06-03 19:32:38 -0400
commit5168ae50a66e3ff7184c2b16d661bd6d70367e50 (patch)
tree2fb21fc3bd346e4f589605d940dfb1bacac30bf5 /kernel
parentd1f74e20b5b064a130cd0743a256c2d3cfe84010 (diff)
tracing: Remove ftrace_preempt_disable/enable
The ftrace_preempt_disable/enable functions were to address a recursive race caused by the function tracer. The function tracer traces all functions which makes it easily susceptible to recursion. One area was preempt_enable(). This would call the scheduler and the schedulre would call the function tracer and loop. (So was it thought). The ftrace_preempt_disable/enable was made to protect against recursion inside the scheduler by storing the NEED_RESCHED flag. If it was set before the ftrace_preempt_disable() it would not call schedule on ftrace_preempt_enable(), thinking that if it was set before then it would have already scheduled unless it was already in the scheduler. This worked fine except in the case of SMP, where another task would set the NEED_RESCHED flag for a task on another CPU, and then kick off an IPI to trigger it. This could cause the NEED_RESCHED to be saved at ftrace_preempt_disable() but the IPI to arrive in the the preempt disabled section. The ftrace_preempt_enable() would not call the scheduler because the flag was already set before entring the section. This bug would cause a missed preemption check and cause lower latencies. Investigating further, I found that the recusion caused by the function tracer was not due to schedule(), but due to preempt_schedule(). Now that preempt_schedule is completely annotated with notrace, the recusion no longer is an issue. Reported-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/trace/ftrace.c5
-rw-r--r--kernel/trace/ring_buffer.c38
-rw-r--r--kernel/trace/trace.c5
-rw-r--r--kernel/trace/trace.h48
-rw-r--r--kernel/trace/trace_clock.c5
-rw-r--r--kernel/trace/trace_events.c5
-rw-r--r--kernel/trace/trace_functions.c6
-rw-r--r--kernel/trace/trace_sched_wakeup.c5
-rw-r--r--kernel/trace/trace_stack.c6
9 files changed, 24 insertions, 99 deletions
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6d2cb14f9449..0d88ce9b9fb8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1883,7 +1883,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1883 struct hlist_head *hhd; 1883 struct hlist_head *hhd;
1884 struct hlist_node *n; 1884 struct hlist_node *n;
1885 unsigned long key; 1885 unsigned long key;
1886 int resched;
1887 1886
1888 key = hash_long(ip, FTRACE_HASH_BITS); 1887 key = hash_long(ip, FTRACE_HASH_BITS);
1889 1888
@@ -1897,12 +1896,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1897 * period. This syncs the hash iteration and freeing of items 1896 * period. This syncs the hash iteration and freeing of items
1898 * on the hash. rcu_read_lock is too dangerous here. 1897 * on the hash. rcu_read_lock is too dangerous here.
1899 */ 1898 */
1900 resched = ftrace_preempt_disable(); 1899 preempt_disable_notrace();
1901 hlist_for_each_entry_rcu(entry, n, hhd, node) { 1900 hlist_for_each_entry_rcu(entry, n, hhd, node) {
1902 if (entry->ip == ip) 1901 if (entry->ip == ip)
1903 entry->ops->func(ip, parent_ip, &entry->data); 1902 entry->ops->func(ip, parent_ip, &entry->data);
1904 } 1903 }
1905 ftrace_preempt_enable(resched); 1904 preempt_enable_notrace();
1906} 1905}
1907 1906
1908static struct ftrace_ops trace_probe_ops __read_mostly = 1907static struct ftrace_ops trace_probe_ops __read_mostly =
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7f6059c5aa94..c3d3cd9c2a53 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2234,8 +2234,6 @@ static void trace_recursive_unlock(void)
2234 2234
2235#endif 2235#endif
2236 2236
2237static DEFINE_PER_CPU(int, rb_need_resched);
2238
2239/** 2237/**
2240 * ring_buffer_lock_reserve - reserve a part of the buffer 2238 * ring_buffer_lock_reserve - reserve a part of the buffer
2241 * @buffer: the ring buffer to reserve from 2239 * @buffer: the ring buffer to reserve from
@@ -2256,13 +2254,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2256{ 2254{
2257 struct ring_buffer_per_cpu *cpu_buffer; 2255 struct ring_buffer_per_cpu *cpu_buffer;
2258 struct ring_buffer_event *event; 2256 struct ring_buffer_event *event;
2259 int cpu, resched; 2257 int cpu;
2260 2258
2261 if (ring_buffer_flags != RB_BUFFERS_ON) 2259 if (ring_buffer_flags != RB_BUFFERS_ON)
2262 return NULL; 2260 return NULL;
2263 2261
2264 /* If we are tracing schedule, we don't want to recurse */ 2262 /* If we are tracing schedule, we don't want to recurse */
2265 resched = ftrace_preempt_disable(); 2263 preempt_disable_notrace();
2266 2264
2267 if (atomic_read(&buffer->record_disabled)) 2265 if (atomic_read(&buffer->record_disabled))
2268 goto out_nocheck; 2266 goto out_nocheck;
@@ -2287,21 +2285,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2287 if (!event) 2285 if (!event)
2288 goto out; 2286 goto out;
2289 2287
2290 /*
2291 * Need to store resched state on this cpu.
2292 * Only the first needs to.
2293 */
2294
2295 if (preempt_count() == 1)
2296 per_cpu(rb_need_resched, cpu) = resched;
2297
2298 return event; 2288 return event;
2299 2289
2300 out: 2290 out:
2301 trace_recursive_unlock(); 2291 trace_recursive_unlock();
2302 2292
2303 out_nocheck: 2293 out_nocheck:
2304 ftrace_preempt_enable(resched); 2294 preempt_enable_notrace();
2305 return NULL; 2295 return NULL;
2306} 2296}
2307EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2297EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -2347,13 +2337,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2347 2337
2348 trace_recursive_unlock(); 2338 trace_recursive_unlock();
2349 2339
2350 /* 2340 preempt_enable_notrace();
2351 * Only the last preempt count needs to restore preemption.
2352 */
2353 if (preempt_count() == 1)
2354 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2355 else
2356 preempt_enable_no_resched_notrace();
2357 2341
2358 return 0; 2342 return 0;
2359} 2343}
@@ -2461,13 +2445,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
2461 2445
2462 trace_recursive_unlock(); 2446 trace_recursive_unlock();
2463 2447
2464 /* 2448 preempt_enable_notrace();
2465 * Only the last preempt count needs to restore preemption.
2466 */
2467 if (preempt_count() == 1)
2468 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2469 else
2470 preempt_enable_no_resched_notrace();
2471 2449
2472} 2450}
2473EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); 2451EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -2493,12 +2471,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
2493 struct ring_buffer_event *event; 2471 struct ring_buffer_event *event;
2494 void *body; 2472 void *body;
2495 int ret = -EBUSY; 2473 int ret = -EBUSY;
2496 int cpu, resched; 2474 int cpu;
2497 2475
2498 if (ring_buffer_flags != RB_BUFFERS_ON) 2476 if (ring_buffer_flags != RB_BUFFERS_ON)
2499 return -EBUSY; 2477 return -EBUSY;
2500 2478
2501 resched = ftrace_preempt_disable(); 2479 preempt_disable_notrace();
2502 2480
2503 if (atomic_read(&buffer->record_disabled)) 2481 if (atomic_read(&buffer->record_disabled))
2504 goto out; 2482 goto out;
@@ -2528,7 +2506,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
2528 2506
2529 ret = 0; 2507 ret = 0;
2530 out: 2508 out:
2531 ftrace_preempt_enable(resched); 2509 preempt_enable_notrace();
2532 2510
2533 return ret; 2511 return ret;
2534} 2512}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 55e48511d7c8..35727140f4fb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1404,7 +1404,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1404 struct bprint_entry *entry; 1404 struct bprint_entry *entry;
1405 unsigned long flags; 1405 unsigned long flags;
1406 int disable; 1406 int disable;
1407 int resched;
1408 int cpu, len = 0, size, pc; 1407 int cpu, len = 0, size, pc;
1409 1408
1410 if (unlikely(tracing_selftest_running || tracing_disabled)) 1409 if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1414,7 +1413,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1414 pause_graph_tracing(); 1413 pause_graph_tracing();
1415 1414
1416 pc = preempt_count(); 1415 pc = preempt_count();
1417 resched = ftrace_preempt_disable(); 1416 preempt_disable_notrace();
1418 cpu = raw_smp_processor_id(); 1417 cpu = raw_smp_processor_id();
1419 data = tr->data[cpu]; 1418 data = tr->data[cpu];
1420 1419
@@ -1452,7 +1451,7 @@ out_unlock:
1452 1451
1453out: 1452out:
1454 atomic_dec_return(&data->disabled); 1453 atomic_dec_return(&data->disabled);
1455 ftrace_preempt_enable(resched); 1454 preempt_enable_notrace();
1456 unpause_graph_tracing(); 1455 unpause_graph_tracing();
1457 1456
1458 return len; 1457 return len;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cd96399463f..6c45e55097ce 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -628,54 +628,6 @@ enum trace_iterator_flags {
628 628
629extern struct tracer nop_trace; 629extern struct tracer nop_trace;
630 630
631/**
632 * ftrace_preempt_disable - disable preemption scheduler safe
633 *
634 * When tracing can happen inside the scheduler, there exists
635 * cases that the tracing might happen before the need_resched
636 * flag is checked. If this happens and the tracer calls
637 * preempt_enable (after a disable), a schedule might take place
638 * causing an infinite recursion.
639 *
640 * To prevent this, we read the need_resched flag before
641 * disabling preemption. When we want to enable preemption we
642 * check the flag, if it is set, then we call preempt_enable_no_resched.
643 * Otherwise, we call preempt_enable.
644 *
645 * The rational for doing the above is that if need_resched is set
646 * and we have yet to reschedule, we are either in an atomic location
647 * (where we do not need to check for scheduling) or we are inside
648 * the scheduler and do not want to resched.
649 */
650static inline int ftrace_preempt_disable(void)
651{
652 int resched;
653
654 resched = need_resched();
655 preempt_disable_notrace();
656
657 return resched;
658}
659
660/**
661 * ftrace_preempt_enable - enable preemption scheduler safe
662 * @resched: the return value from ftrace_preempt_disable
663 *
664 * This is a scheduler safe way to enable preemption and not miss
665 * any preemption checks. The disabled saved the state of preemption.
666 * If resched is set, then we are either inside an atomic or
667 * are inside the scheduler (we would have already scheduled
668 * otherwise). In this case, we do not want to call normal
669 * preempt_enable, but preempt_enable_no_resched instead.
670 */
671static inline void ftrace_preempt_enable(int resched)
672{
673 if (resched)
674 preempt_enable_no_resched_notrace();
675 else
676 preempt_enable_notrace();
677}
678
679#ifdef CONFIG_BRANCH_TRACER 631#ifdef CONFIG_BRANCH_TRACER
680extern int enable_branch_tracing(struct trace_array *tr); 632extern int enable_branch_tracing(struct trace_array *tr);
681extern void disable_branch_tracing(void); 633extern void disable_branch_tracing(void);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 9d589d8dcd1a..52fda6c04ac3 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -32,16 +32,15 @@
32u64 notrace trace_clock_local(void) 32u64 notrace trace_clock_local(void)
33{ 33{
34 u64 clock; 34 u64 clock;
35 int resched;
36 35
37 /* 36 /*
38 * sched_clock() is an architecture implemented, fast, scalable, 37 * sched_clock() is an architecture implemented, fast, scalable,
39 * lockless clock. It is not guaranteed to be coherent across 38 * lockless clock. It is not guaranteed to be coherent across
40 * CPUs, nor across CPU idle events. 39 * CPUs, nor across CPU idle events.
41 */ 40 */
42 resched = ftrace_preempt_disable(); 41 preempt_disable_notrace();
43 clock = sched_clock(); 42 clock = sched_clock();
44 ftrace_preempt_enable(resched); 43 preempt_enable_notrace();
45 44
46 return clock; 45 return clock;
47} 46}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53cffc0b0801..a594f9a7ee3d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1524,12 +1524,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1524 struct ftrace_entry *entry; 1524 struct ftrace_entry *entry;
1525 unsigned long flags; 1525 unsigned long flags;
1526 long disabled; 1526 long disabled;
1527 int resched;
1528 int cpu; 1527 int cpu;
1529 int pc; 1528 int pc;
1530 1529
1531 pc = preempt_count(); 1530 pc = preempt_count();
1532 resched = ftrace_preempt_disable(); 1531 preempt_disable_notrace();
1533 cpu = raw_smp_processor_id(); 1532 cpu = raw_smp_processor_id();
1534 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); 1533 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1535 1534
@@ -1551,7 +1550,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1551 1550
1552 out: 1551 out:
1553 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1552 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1554 ftrace_preempt_enable(resched); 1553 preempt_enable_notrace();
1555} 1554}
1556 1555
1557static struct ftrace_ops trace_ops __initdata = 1556static struct ftrace_ops trace_ops __initdata =
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3f3776b0cd6..16aee4d44e8f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
54 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
55 unsigned long flags; 55 unsigned long flags;
56 long disabled; 56 long disabled;
57 int cpu, resched; 57 int cpu;
58 int pc; 58 int pc;
59 59
60 if (unlikely(!ftrace_function_enabled)) 60 if (unlikely(!ftrace_function_enabled))
61 return; 61 return;
62 62
63 pc = preempt_count(); 63 pc = preempt_count();
64 resched = ftrace_preempt_disable(); 64 preempt_disable_notrace();
65 local_save_flags(flags); 65 local_save_flags(flags);
66 cpu = raw_smp_processor_id(); 66 cpu = raw_smp_processor_id();
67 data = tr->data[cpu]; 67 data = tr->data[cpu];
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
71 trace_function(tr, ip, parent_ip, flags, pc); 71 trace_function(tr, ip, parent_ip, flags, pc);
72 72
73 atomic_dec(&data->disabled); 73 atomic_dec(&data->disabled);
74 ftrace_preempt_enable(resched); 74 preempt_enable_notrace();
75} 75}
76 76
77static void 77static void
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0e73bc2ef8c5..c9fd5bd02036 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
46 struct trace_array_cpu *data; 46 struct trace_array_cpu *data;
47 unsigned long flags; 47 unsigned long flags;
48 long disabled; 48 long disabled;
49 int resched;
50 int cpu; 49 int cpu;
51 int pc; 50 int pc;
52 51
@@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
54 return; 53 return;
55 54
56 pc = preempt_count(); 55 pc = preempt_count();
57 resched = ftrace_preempt_disable(); 56 preempt_disable_notrace();
58 57
59 cpu = raw_smp_processor_id(); 58 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu) 59 if (cpu != wakeup_current_cpu)
@@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
74 out: 73 out:
75 atomic_dec(&data->disabled); 74 atomic_dec(&data->disabled);
76 out_enable: 75 out_enable:
77 ftrace_preempt_enable(resched); 76 preempt_enable_notrace();
78} 77}
79 78
80static struct ftrace_ops trace_ops __read_mostly = 79static struct ftrace_ops trace_ops __read_mostly =
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index f4bc9b27de5f..056468eae7cf 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -110,12 +110,12 @@ static inline void check_stack(void)
110static void 110static void
111stack_trace_call(unsigned long ip, unsigned long parent_ip) 111stack_trace_call(unsigned long ip, unsigned long parent_ip)
112{ 112{
113 int cpu, resched; 113 int cpu;
114 114
115 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
116 return; 116 return;
117 117
118 resched = ftrace_preempt_disable(); 118 preempt_disable_notrace();
119 119
120 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
121 /* no atomic needed, we only modify this variable by this cpu */ 121 /* no atomic needed, we only modify this variable by this cpu */
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
127 out: 127 out:
128 per_cpu(trace_active, cpu)--; 128 per_cpu(trace_active, cpu)--;
129 /* prevent recursion in schedule */ 129 /* prevent recursion in schedule */
130 ftrace_preempt_enable(resched); 130 preempt_enable_notrace();
131} 131}
132 132
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =